Branch data Line data Source code
1 : : /* Array prefetching.
2 : : Copyright (C) 2005-2025 Free Software Foundation, Inc.
3 : :
4 : : This file is part of GCC.
5 : :
6 : : GCC is free software; you can redistribute it and/or modify it
7 : : under the terms of the GNU General Public License as published by the
8 : : Free Software Foundation; either version 3, or (at your option) any
9 : : later version.
10 : :
11 : : GCC is distributed in the hope that it will be useful, but WITHOUT
12 : : ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 : : for more details.
15 : :
16 : : You should have received a copy of the GNU General Public License
17 : : along with GCC; see the file COPYING3. If not see
18 : : <http://www.gnu.org/licenses/>. */
19 : :
20 : : #include "config.h"
21 : : #include "system.h"
22 : : #include "coretypes.h"
23 : : #include "backend.h"
24 : : #include "target.h"
25 : : #include "rtl.h"
26 : : #include "tree.h"
27 : : #include "gimple.h"
28 : : #include "predict.h"
29 : : #include "tree-pass.h"
30 : : #include "gimple-ssa.h"
31 : : #include "optabs-query.h"
32 : : #include "tree-pretty-print.h"
33 : : #include "fold-const.h"
34 : : #include "stor-layout.h"
35 : : #include "gimplify.h"
36 : : #include "gimple-iterator.h"
37 : : #include "gimplify-me.h"
38 : : #include "tree-ssa-loop-ivopts.h"
39 : : #include "tree-ssa-loop-manip.h"
40 : : #include "tree-ssa-loop-niter.h"
41 : : #include "tree-ssa-loop.h"
42 : : #include "ssa.h"
43 : : #include "tree-into-ssa.h"
44 : : #include "cfgloop.h"
45 : : #include "tree-scalar-evolution.h"
46 : : #include "langhooks.h"
47 : : #include "tree-inline.h"
48 : : #include "tree-data-ref.h"
49 : : #include "diagnostic-core.h"
50 : : #include "dbgcnt.h"
51 : :
52 : : /* This pass inserts prefetch instructions to optimize cache usage during
53 : : accesses to arrays in loops. It processes loops sequentially and:
54 : :
55 : : 1) Gathers all memory references in the single loop.
56 : : 2) For each of the references it decides when it is profitable to prefetch
57 : : it. To do this, we evaluate the reuse among the accesses and determine
58 : : two values: PREFETCH_BEFORE (meaning that it only makes sense to do
59 : : prefetching in the first PREFETCH_BEFORE iterations of the loop) and
60 : : PREFETCH_MOD (meaning that it only makes sense to prefetch in the
61 : : iterations of the loop that are zero modulo PREFETCH_MOD). For example
62 : : (assuming cache line size is 64 bytes, char has size 1 byte and there
63 : : is no hardware sequential prefetch):
64 : :
65 : : char *a;
66 : : for (i = 0; i < max; i++)
67 : : {
68 : : a[255] = ...; (0)
69 : : a[i] = ...; (1)
70 : : a[i + 64] = ...; (2)
71 : : a[16*i] = ...; (3)
72 : : a[187*i] = ...; (4)
73 : : a[187*i + 50] = ...; (5)
74 : : }
75 : :
76 : : (0) obviously has PREFETCH_BEFORE 1
77 : : (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
78 : : location 64 iterations before it, and PREFETCH_MOD 64 (since
79 : : it hits the same cache line otherwise).
80 : : (2) has PREFETCH_MOD 64
81 : : (3) has PREFETCH_MOD 4
82 : : (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
83 : : the cache line accessed by (5) is the same with probability only
84 : : 7/32.
85 : : (5) has PREFETCH_MOD 1 as well.
86 : :
87 : : Additionally, we use data dependence analysis to determine for each
88 : : reference the distance till the first reuse; this information is used
89 : : to determine the temporality of the issued prefetch instruction.
90 : :
91 : : 3) We determine how much ahead we need to prefetch. The number of
92 : : iterations needed is time to fetch / time spent in one iteration of
93 : : the loop. The problem is that we do not know either of these values,
94 : : so we just make a heuristic guess based on a magic (possibly)
95 : : target-specific constant and size of the loop.
96 : :
97 : : 4) Determine which of the references we prefetch. We take into account
98 : : that there is a maximum number of simultaneous prefetches (provided
99 : : by the machine description). We issue as many prefetches as possible
100 : : while staying within this bound (starting with those with the lowest
101 : : prefetch_mod, since they are responsible for most of the cache
102 : : misses).
103 : :
104 : : 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
105 : : and PREFETCH_BEFORE requirements (within some bounds), and to avoid
106 : : prefetching nonaccessed memory.
107 : : TODO -- actually implement peeling.
108 : :
109 : : 6) We actually emit the prefetch instructions. ??? Perhaps emit the
110 : : prefetch instructions with guards in cases where 5) was not sufficient
111 : : to satisfy the constraints?
112 : :
113 : : A cost model is implemented to determine whether or not prefetching is
114 : : profitable for a given loop. The cost model has three heuristics:
115 : :
116 : : 1. Function trip_count_to_ahead_ratio_too_small_p implements a
117 : : heuristic that determines whether or not the loop has too few
118 : : iterations (compared to ahead). Prefetching is not likely to be
119 : : beneficial if the trip count to ahead ratio is below a certain
120 : : minimum.
121 : :
122 : : 2. Function mem_ref_count_reasonable_p implements a heuristic that
123 : : determines whether the given loop has enough CPU ops that can be
124 : : overlapped with cache missing memory ops. If not, the loop
125 : : won't benefit from prefetching. In the implementation,
126 : : prefetching is not considered beneficial if the ratio between
127 : : the instruction count and the mem ref count is below a certain
128 : : minimum.
129 : :
130 : : 3. Function insn_to_prefetch_ratio_too_small_p implements a
131 : : heuristic that disables prefetching in a loop if the prefetching
132 : : cost is above a certain limit. The relative prefetching cost is
133 : : estimated by taking the ratio between the prefetch count and the
134 : : total instruction count (this models the I-cache cost).
135 : :
136 : : The limits used in these heuristics are defined as parameters with
137 : : reasonable default values. Machine-specific default values will be
138 : : added later.
139 : :
140 : : Some other TODO:
141 : : -- write and use more general reuse analysis (that could be also used
142 : : in other cache aimed loop optimizations)
143 : : -- make it behave sanely together with the prefetches given by the user
144 : : (now we just ignore them; at the very least we should avoid
145 : : optimizing loops in which the user has put their own prefetches)
146 : : -- we assume cache line size alignment of arrays; this could be
147 : : improved. */
148 : :
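: : /* As a worked illustration of step 3) above (illustrative numbers only,
: : not taken from any particular target): if fetching a cache line is
: : assumed to cost roughly 200 cycles and one iteration of the loop body
: : is estimated at roughly 20 cycles, the pass aims to prefetch data that
: : will be used about 200 / 20 = 10 iterations ahead. */
: :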
149 : : /* Magic constants follow. These should be replaced by machine specific
150 : : numbers. */
151 : :
152 : : /* True if write can be prefetched by a read prefetch. */
153 : :
154 : : #ifndef WRITE_CAN_USE_READ_PREFETCH
155 : : #define WRITE_CAN_USE_READ_PREFETCH 1
156 : : #endif
157 : :
158 : : /* True if read can be prefetched by a write prefetch. */
159 : :
160 : : #ifndef READ_CAN_USE_WRITE_PREFETCH
161 : : #define READ_CAN_USE_WRITE_PREFETCH 0
162 : : #endif
163 : :
164 : : /* The size of the block loaded by a single prefetch. Usually, this is
165 : : the same as cache line size (at the moment, we only consider one level
166 : : of cache hierarchy). */
167 : :
168 : : #ifndef PREFETCH_BLOCK
169 : : #define PREFETCH_BLOCK param_l1_cache_line_size
170 : : #endif
171 : :
172 : : /* Do we have forward hardware sequential prefetching? */
173 : :
174 : : #ifndef HAVE_FORWARD_PREFETCH
175 : : #define HAVE_FORWARD_PREFETCH 0
176 : : #endif
177 : :
178 : : /* Do we have backward hardware sequential prefetching? */
179 : :
180 : : #ifndef HAVE_BACKWARD_PREFETCH
181 : : #define HAVE_BACKWARD_PREFETCH 0
182 : : #endif
183 : :
184 : : /* In some cases we are only able to determine that there is a certain
185 : : probability that the two accesses hit the same cache line. In this
186 : : case, we issue the prefetches for both of them if this probability
187 : : is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
188 : :
189 : : #ifndef ACCEPTABLE_MISS_RATE
190 : : #define ACCEPTABLE_MISS_RATE 50
191 : : #endif
192 : :
193 : : #define L1_CACHE_SIZE_BYTES ((unsigned) (param_l1_cache_size * 1024))
194 : : #define L2_CACHE_SIZE_BYTES ((unsigned) (param_l2_cache_size * 1024))
195 : :
196 : : /* We consider a memory access nontemporal if it is not reused sooner than
197 : : after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
198 : : accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
199 : : so that we use nontemporal prefetches e.g. if a single memory location
200 : : is accessed several times in a single iteration of the loop. */
201 : : #define NONTEMPORAL_FRACTION 16
202 : :
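: : /* For illustration, with example values param_l1_cache_size = 32 (kB)
: : and param_l2_cache_size = 512 (kB): reuses separated by less than
: : 32768 / NONTEMPORAL_FRACTION = 2048 bytes of intervening accesses are
: : disregarded when computing the reuse distance, and a reference whose
: : remaining reuse distance is at least 524288 bytes is then treated as
: : nontemporal. */
: :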
203 : : /* In case we have to emit a memory fence instruction after the loop that
204 : : uses nontemporal stores, this defines the builtin to use. */
205 : :
206 : : #ifndef FENCE_FOLLOWING_MOVNT
207 : : #define FENCE_FOLLOWING_MOVNT NULL_TREE
208 : : #endif
209 : :
210 : : /* It is not profitable to prefetch when the trip count is not at
211 : : least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
212 : : For example, in a loop with a prefetch ahead distance of 10,
213 : : supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
214 : : profitable to prefetch when the trip count is greater than or equal to
215 : : 40. In that case, 30 out of the 40 iterations will benefit from
216 : : prefetching. */
217 : :
218 : : #ifndef TRIP_COUNT_TO_AHEAD_RATIO
219 : : #define TRIP_COUNT_TO_AHEAD_RATIO 4
220 : : #endif
221 : :
222 : : /* A group of references between which reuse may occur. */
223 : :
224 : : struct mem_ref_group
225 : : {
226 : : tree base; /* Base of the reference. */
227 : : tree step; /* Step of the reference. */
228 : : struct mem_ref *refs; /* References in the group. */
229 : : struct mem_ref_group *next; /* Next group of references. */
230 : : unsigned int uid; /* Group UID, used only for debugging. */
231 : : };
232 : :
233 : : /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
234 : :
235 : : #define PREFETCH_ALL HOST_WIDE_INT_M1U
236 : :
237 : : /* Do not generate a prefetch if the unroll factor is significantly less
238 : : than what is required by the prefetch. This is to avoid redundant
239 : : prefetches. For example, when prefetch_mod is 16 and unroll_factor is
240 : : 2, prefetching requires unrolling the loop 16 times, but
241 : : the loop is actually unrolled twice. In this case (ratio = 8),
242 : : prefetching is not likely to be beneficial. */
243 : :
244 : : #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
245 : : #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
246 : : #endif
247 : :
248 : : /* Some of the prefetch computations have quadratic complexity. We want to
249 : : avoid huge compile times and, therefore, want to limit the amount of
250 : : memory references per loop where we consider prefetching. */
251 : :
252 : : #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
253 : : #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
254 : : #endif
255 : :
256 : : /* The memory reference. */
257 : :
258 : : struct mem_ref
259 : : {
260 : : gimple *stmt; /* Statement in which the reference appears. */
261 : : tree mem; /* The reference. */
262 : : HOST_WIDE_INT delta; /* Constant offset of the reference. */
263 : : struct mem_ref_group *group; /* The group of references it belongs to. */
264 : : unsigned HOST_WIDE_INT prefetch_mod;
265 : : /* Prefetch only each PREFETCH_MOD-th
266 : : iteration. */
267 : : unsigned HOST_WIDE_INT prefetch_before;
268 : : /* Prefetch only first PREFETCH_BEFORE
269 : : iterations. */
270 : : unsigned reuse_distance; /* The amount of data accessed before the first
271 : : reuse of this value. */
272 : : struct mem_ref *next; /* The next reference in the group. */
273 : : unsigned int uid; /* Ref UID, used only for debugging. */
274 : : unsigned write_p : 1; /* Is it a write? */
275 : : unsigned independent_p : 1; /* True if the reference is independent of
276 : : all other references inside the loop. */
277 : : unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */
278 : : unsigned storent_p : 1; /* True if we changed the store to a
279 : : nontemporal one. */
280 : : };
281 : :
282 : : /* Dumps information about a memory reference to FILE. */
283 : : static void
284 : 31 : dump_mem_details (FILE *file, tree base, tree step,
285 : : HOST_WIDE_INT delta, bool write_p)
286 : : {
287 : 31 : fprintf (file, "(base ");
288 : 31 : print_generic_expr (file, base, TDF_SLIM);
289 : 31 : fprintf (file, ", step ");
290 : 31 : if (cst_and_fits_in_hwi (step))
291 : 23 : fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
292 : : else
293 : 8 : print_generic_expr (file, step, TDF_SLIM);
294 : 31 : fprintf (file, ")\n");
295 : 31 : fprintf (file, " delta " HOST_WIDE_INT_PRINT_DEC "\n", delta);
296 : 51 : fprintf (file, " %s\n\n", write_p ? "write" : "read");
297 : 31 : }
298 : :
299 : : /* Dumps information about reference REF to FILE. */
300 : :
301 : : static void
302 : 54 : dump_mem_ref (FILE *file, struct mem_ref *ref)
303 : : {
304 : 54 : fprintf (file, "reference %u:%u (", ref->group->uid, ref->uid);
305 : 54 : print_generic_expr (file, ref->mem, TDF_SLIM);
306 : 54 : fprintf (file, ")\n");
307 : 54 : }
308 : :
309 : : /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
310 : : exist. */
311 : :
312 : : static struct mem_ref_group *
313 : 401 : find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
314 : : {
315 : : /* Global count for setting struct mem_ref_group->uid. */
316 : 401 : static unsigned int last_mem_ref_group_uid = 0;
317 : :
318 : 401 : struct mem_ref_group *group;
319 : :
320 : 1179 : for (; *groups; groups = &(*groups)->next)
321 : : {
322 : 953 : if (operand_equal_p ((*groups)->step, step, 0)
323 : 953 : && operand_equal_p ((*groups)->base, base, 0))
324 : 173 : return *groups;
325 : :
326 : : /* If step is an integer constant, keep the list of groups sorted
327 : : by decreasing step. */
328 : 1464 : if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
329 : 1456 : && int_cst_value ((*groups)->step) < int_cst_value (step))
330 : : break;
331 : : }
332 : :
333 : 228 : group = XNEW (struct mem_ref_group);
334 : 228 : group->base = base;
335 : 228 : group->step = step;
336 : 228 : group->refs = NULL;
337 : 228 : group->uid = ++last_mem_ref_group_uid;
338 : 228 : group->next = *groups;
339 : 228 : *groups = group;
340 : :
341 : 228 : return group;
342 : : }
343 : :
344 : : /* Records a memory reference MEM in GROUP with offset DELTA and write status
345 : : WRITE_P. The reference occurs in statement STMT. */
346 : :
347 : : static void
348 : 401 : record_ref (struct mem_ref_group *group, gimple *stmt, tree mem,
349 : : HOST_WIDE_INT delta, bool write_p)
350 : : {
351 : 401 : unsigned int last_mem_ref_uid = 0;
352 : 401 : struct mem_ref **aref;
353 : :
354 : : /* Do not record the same address twice. */
355 : 445 : for (aref = &group->refs; *aref; aref = &(*aref)->next)
356 : : {
357 : 183 : last_mem_ref_uid = (*aref)->uid;
358 : :
359 : : /* It may not be possible for a write reference to reuse a read
360 : : prefetch, or vice versa. */
361 : 183 : if (!WRITE_CAN_USE_READ_PREFETCH
362 : : && write_p
363 : : && !(*aref)->write_p)
364 : : continue;
365 : 183 : if (!READ_CAN_USE_WRITE_PREFETCH
366 : : && !write_p
367 : 34 : && (*aref)->write_p)
368 : 0 : continue;
369 : :
370 : 183 : if ((*aref)->delta == delta)
371 : : return;
372 : : }
373 : :
374 : 262 : (*aref) = XNEW (struct mem_ref);
375 : 262 : (*aref)->stmt = stmt;
376 : 262 : (*aref)->mem = mem;
377 : 262 : (*aref)->delta = delta;
378 : 262 : (*aref)->write_p = write_p;
379 : 262 : (*aref)->prefetch_before = PREFETCH_ALL;
380 : 262 : (*aref)->prefetch_mod = 1;
381 : 262 : (*aref)->reuse_distance = 0;
382 : 262 : (*aref)->issue_prefetch_p = false;
383 : 262 : (*aref)->group = group;
384 : 262 : (*aref)->next = NULL;
385 : 262 : (*aref)->independent_p = false;
386 : 262 : (*aref)->storent_p = false;
387 : 262 : (*aref)->uid = last_mem_ref_uid + 1;
388 : :
389 : 262 : if (dump_file && (dump_flags & TDF_DETAILS))
390 : : {
391 : 27 : dump_mem_ref (dump_file, *aref);
392 : :
393 : 27 : fprintf (dump_file, " group %u ", group->uid);
394 : 27 : dump_mem_details (dump_file, group->base, group->step, delta,
395 : : write_p);
396 : : }
397 : : }
398 : :
399 : : /* Release memory references in GROUPS. */
400 : :
401 : : static void
402 : 122 : release_mem_refs (struct mem_ref_group *groups)
403 : : {
404 : 122 : struct mem_ref_group *next_g;
405 : 122 : struct mem_ref *ref, *next_r;
406 : :
407 : 350 : for (; groups; groups = next_g)
408 : : {
409 : 228 : next_g = groups->next;
410 : 490 : for (ref = groups->refs; ref; ref = next_r)
411 : : {
412 : 262 : next_r = ref->next;
413 : 262 : free (ref);
414 : : }
415 : 228 : free (groups);
416 : : }
417 : 122 : }
418 : :
419 : : /* A structure used to pass arguments to idx_analyze_ref. */
420 : :
421 : : struct ar_data
422 : : {
423 : : class loop *loop; /* Loop of the reference. */
424 : : gimple *stmt; /* Statement of the reference. */
425 : : tree *step; /* Step of the memory reference. */
426 : : HOST_WIDE_INT *delta; /* Offset of the memory reference. */
427 : : };
428 : :
429 : : /* Analyzes a single INDEX of a memory reference to obtain information
430 : : described in analyze_ref. Callback for for_each_index. */
431 : :
432 : : static bool
433 : 556 : idx_analyze_ref (tree base, tree *index, void *data)
434 : : {
435 : 556 : struct ar_data *ar_data = (struct ar_data *) data;
436 : 556 : tree ibase, step, stepsize;
437 : 556 : HOST_WIDE_INT idelta = 0, imult = 1;
438 : 556 : affine_iv iv;
439 : :
440 : 1112 : if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
441 : : *index, &iv, true))
442 : : return false;
443 : 421 : ibase = iv.base;
444 : 421 : step = iv.step;
445 : :
446 : 421 : if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
447 : 421 : && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
448 : : {
449 : 37 : idelta = int_cst_value (TREE_OPERAND (ibase, 1));
450 : 37 : ibase = TREE_OPERAND (ibase, 0);
451 : : }
452 : 421 : if (cst_and_fits_in_hwi (ibase))
453 : : {
454 : 23 : idelta += int_cst_value (ibase);
455 : 23 : ibase = build_int_cst (TREE_TYPE (ibase), 0);
456 : : }
457 : :
458 : 421 : if (TREE_CODE (base) == ARRAY_REF)
459 : : {
460 : 55 : stepsize = array_ref_element_size (base);
461 : 55 : if (!cst_and_fits_in_hwi (stepsize))
462 : : return false;
463 : 55 : imult = int_cst_value (stepsize);
464 : 55 : step = fold_build2 (MULT_EXPR, sizetype,
465 : : fold_convert (sizetype, step),
466 : : fold_convert (sizetype, stepsize));
467 : 55 : idelta *= imult;
468 : : }
469 : :
470 : 421 : if (*ar_data->step == NULL_TREE)
471 : 405 : *ar_data->step = step;
472 : : else
473 : 16 : *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
474 : : fold_convert (sizetype, *ar_data->step),
475 : : fold_convert (sizetype, step));
476 : 421 : *ar_data->delta += idelta;
477 : 421 : *index = ibase;
478 : :
479 : 421 : return true;
480 : : }
481 : :
482 : : /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
483 : : STEP are integer constants and iter is number of iterations of LOOP. The
484 : : reference occurs in statement STMT. Strips nonaddressable component
485 : : references from REF_P. */
486 : :
487 : : static bool
488 : 542 : analyze_ref (class loop *loop, tree *ref_p, tree *base,
489 : : tree *step, HOST_WIDE_INT *delta,
490 : : gimple *stmt)
491 : : {
492 : 542 : struct ar_data ar_data;
493 : 542 : tree off;
494 : 542 : HOST_WIDE_INT bit_offset;
495 : 542 : tree ref = *ref_p;
496 : :
497 : 542 : *step = NULL_TREE;
498 : 542 : *delta = 0;
499 : :
500 : : /* First strip off the component references. Ignore bitfields.
501 : : Also strip off the real and imaginary parts of a complex, so that
502 : : they can have the same base. */
503 : 542 : if (TREE_CODE (ref) == REALPART_EXPR
504 : 542 : || TREE_CODE (ref) == IMAGPART_EXPR
505 : 542 : || (TREE_CODE (ref) == COMPONENT_REF
506 : 20 : && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
507 : : {
508 : 8 : if (TREE_CODE (ref) == IMAGPART_EXPR)
509 : 4 : *delta += int_size_in_bytes (TREE_TYPE (ref));
510 : 8 : ref = TREE_OPERAND (ref, 0);
511 : : }
512 : :
513 : 542 : *ref_p = ref;
514 : :
515 : 567 : for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
516 : : {
517 : 25 : off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
518 : 25 : bit_offset = TREE_INT_CST_LOW (off);
519 : 25 : gcc_assert (bit_offset % BITS_PER_UNIT == 0);
520 : :
521 : 25 : *delta += bit_offset / BITS_PER_UNIT;
522 : : }
523 : :
524 : 542 : *base = unshare_expr (ref);
525 : 542 : ar_data.loop = loop;
526 : 542 : ar_data.stmt = stmt;
527 : 542 : ar_data.step = step;
528 : 542 : ar_data.delta = delta;
529 : 542 : return for_each_index (base, idx_analyze_ref, &ar_data);
530 : : }
531 : :
532 : : /* Record a memory reference REF to the list REFS. The reference occurs in
533 : : LOOP in statement STMT and it is write if WRITE_P. Returns true if the
534 : : reference was recorded, false otherwise. */
535 : :
536 : : static bool
537 : 542 : gather_memory_references_ref (class loop *loop, struct mem_ref_group **refs,
538 : : tree ref, bool write_p, gimple *stmt)
539 : : {
540 : 542 : tree base, step;
541 : 542 : HOST_WIDE_INT delta;
542 : 542 : struct mem_ref_group *agrp;
543 : :
544 : 542 : if (get_base_address (ref) == NULL)
545 : : return false;
546 : :
547 : 542 : if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
548 : : return false;
549 : : /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
550 : 407 : if (step == NULL_TREE)
551 : : return false;
552 : :
553 : : /* Stop if the address of BASE could not be taken. */
554 : 405 : if (may_be_nonaddressable_p (base))
555 : : return false;
556 : :
557 : : /* Limit non-constant step prefetching only to the innermost loops and
558 : : only when the step is loop invariant in the entire loop nest. */
559 : 405 : if (!cst_and_fits_in_hwi (step))
560 : : {
561 : 187 : if (loop->inner != NULL)
562 : : {
563 : 0 : if (dump_file && (dump_flags & TDF_DETAILS))
564 : : {
565 : 0 : fprintf (dump_file, "Memory expression %p\n",(void *) ref );
566 : 0 : print_generic_expr (dump_file, ref, TDF_SLIM);
567 : 0 : fprintf (dump_file,":");
568 : 0 : dump_mem_details (dump_file, base, step, delta, write_p);
569 : 0 : fprintf (dump_file,
570 : : "Ignoring %p, non-constant step prefetching is "
571 : : "limited to innermost loops\n",
572 : : (void *) ref);
573 : : }
574 : 0 : return false;
575 : : }
576 : : else
577 : : {
578 : 187 : if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
579 : : {
580 : 4 : if (dump_file && (dump_flags & TDF_DETAILS))
581 : : {
582 : 4 : fprintf (dump_file, "Memory expression %p\n",(void *) ref );
583 : 4 : print_generic_expr (dump_file, ref, TDF_SLIM);
584 : 4 : fprintf (dump_file,":");
585 : 4 : dump_mem_details (dump_file, base, step, delta, write_p);
586 : 4 : fprintf (dump_file,
587 : : "Not prefetching, ignoring %p due to "
588 : : "loop variant step\n",
589 : : (void *) ref);
590 : : }
591 : 4 : return false;
592 : : }
593 : : }
594 : : }
595 : :
596 : : /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
597 : : are integer constants. */
598 : 401 : agrp = find_or_create_group (refs, base, step);
599 : 401 : record_ref (agrp, stmt, ref, delta, write_p);
600 : :
601 : 401 : return true;
602 : : }
603 : :
604 : : /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
605 : : true if there are no other memory references inside the loop. */
606 : :
607 : : static struct mem_ref_group *
608 : 122 : gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_count)
609 : : {
610 : 122 : basic_block *body = get_loop_body_in_dom_order (loop);
611 : 122 : basic_block bb;
612 : 122 : unsigned i;
613 : 122 : gimple_stmt_iterator bsi;
614 : 122 : gimple *stmt;
615 : 122 : tree lhs, rhs;
616 : 122 : struct mem_ref_group *refs = NULL;
617 : :
618 : 122 : *no_other_refs = true;
619 : 122 : *ref_count = 0;
620 : :
621 : : /* Scan the loop body in order, so that the former references precede the
622 : : later ones. */
623 : 1037 : for (i = 0; i < loop->num_nodes; i++)
624 : : {
625 : 915 : bb = body[i];
626 : 915 : if (bb->loop_father != loop)
627 : 543 : continue;
628 : :
629 : 4487 : for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
630 : : {
631 : 3743 : stmt = gsi_stmt (bsi);
632 : :
633 : 3743 : if (gimple_code (stmt) != GIMPLE_ASSIGN)
634 : : {
635 : 518 : if (gimple_vuse (stmt)
636 : 518 : || (is_gimple_call (stmt)
637 : 0 : && !(gimple_call_flags (stmt) & ECF_CONST)))
638 : 57 : *no_other_refs = false;
639 : 518 : continue;
640 : : }
641 : :
642 : 3225 : if (! gimple_vuse (stmt))
643 : 2140 : continue;
644 : :
645 : 1085 : lhs = gimple_assign_lhs (stmt);
646 : 1085 : rhs = gimple_assign_rhs1 (stmt);
647 : :
648 : 1085 : if (REFERENCE_CLASS_P (rhs))
649 : : {
650 : 289 : *no_other_refs &= gather_memory_references_ref (loop, &refs,
651 : : rhs, false, stmt);
652 : 289 : *ref_count += 1;
653 : : }
654 : 1085 : if (REFERENCE_CLASS_P (lhs))
655 : : {
656 : 253 : *no_other_refs &= gather_memory_references_ref (loop, &refs,
657 : : lhs, true, stmt);
658 : 253 : *ref_count += 1;
659 : : }
660 : : }
661 : : }
662 : 122 : free (body);
663 : :
664 : 122 : return refs;
665 : : }
666 : :
667 : : /* Prune the prefetch candidate REF using the self-reuse. */
668 : :
669 : : static void
670 : 186 : prune_ref_by_self_reuse (struct mem_ref *ref)
671 : : {
672 : 186 : HOST_WIDE_INT step;
673 : 186 : bool backward;
674 : :
675 : : /* If the step size is non-constant, we cannot calculate prefetch_mod. */
676 : 186 : if (!cst_and_fits_in_hwi (ref->group->step))
677 : : return;
678 : :
679 : 137 : step = int_cst_value (ref->group->step);
680 : :
681 : 137 : backward = step < 0;
682 : :
683 : 137 : if (step == 0)
684 : : {
685 : : /* Prefetch references to invariant address just once. */
686 : 4 : ref->prefetch_before = 1;
687 : 4 : return;
688 : : }
689 : :
690 : 133 : if (backward)
691 : : step = -step;
692 : :
693 : 133 : if (step > PREFETCH_BLOCK)
694 : : return;
695 : :
696 : 108 : if ((backward && HAVE_BACKWARD_PREFETCH)
697 : : || (!backward && HAVE_FORWARD_PREFETCH))
698 : : {
699 : : ref->prefetch_before = 1;
700 : : return;
701 : : }
702 : :
703 : 108 : ref->prefetch_mod = PREFETCH_BLOCK / step;
704 : : }
705 : :
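: : /* Illustration of the cases above, assuming PREFETCH_BLOCK = 64 and no
: : hardware sequential prefetch: a reference with constant step 4 (say,
: : an int array walked element by element) gets prefetch_mod = 64 / 4 = 16,
: : so one prefetch covers 16 iterations; a reference with step 0
: : (invariant address) gets prefetch_before = 1; a reference with step 100
: : touches a new cache line every iteration, so no self-reuse pruning is
: : done. */
: :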
706 : : /* Divides X by BY, rounding down. */
707 : :
708 : : static HOST_WIDE_INT
709 : 40 : ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
710 : : {
711 : 40 : gcc_assert (by > 0);
712 : :
713 : 40 : if (x >= 0)
714 : 40 : return x / (HOST_WIDE_INT) by;
715 : : else
716 : 0 : return (x + (HOST_WIDE_INT) by - 1) / (HOST_WIDE_INT) by;
717 : : }
718 : :
719 : : /* Given a CACHE_LINE_SIZE and two inductive memory references
720 : : with a common STEP greater than CACHE_LINE_SIZE and an address
721 : : difference DELTA, compute the probability that they will fall
722 : : in different cache lines. Return true if the computed miss rate
723 : : is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the
724 : : number of distinct iterations after which the pattern repeats itself.
725 : : ALIGN_UNIT is the unit of alignment in bytes. */
726 : :
727 : : static bool
728 : 12 : is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
729 : : HOST_WIDE_INT step, HOST_WIDE_INT delta,
730 : : unsigned HOST_WIDE_INT distinct_iters,
731 : : int align_unit)
732 : : {
733 : 12 : unsigned align, iter;
734 : 12 : int total_positions, miss_positions, max_allowed_miss_positions;
735 : 12 : int address1, address2, cache_line1, cache_line2;
736 : :
737 : : /* It always misses if delta is greater than or equal to the cache
738 : : line size. */
739 : 12 : if (delta >= (HOST_WIDE_INT) cache_line_size)
740 : : return false;
741 : :
742 : 6 : gcc_assert (align_unit > 0);
743 : :
744 : 6 : miss_positions = 0;
745 : 6 : total_positions = (cache_line_size / align_unit) * distinct_iters;
746 : 6 : max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
747 : :
748 : : /* Iterate through all possible alignments of the first
749 : : memory reference within its cache line. */
750 : 22 : for (align = 0; align < cache_line_size; align += align_unit)
751 : :
752 : : /* Iterate through all distinct iterations. */
753 : 60 : for (iter = 0; iter < distinct_iters; iter++)
754 : : {
755 : 44 : address1 = align + step * iter;
756 : 44 : address2 = address1 + delta;
757 : 44 : cache_line1 = address1 / cache_line_size;
758 : 44 : cache_line2 = address2 / cache_line_size;
759 : 44 : if (cache_line1 != cache_line2)
760 : : {
761 : 6 : miss_positions += 1;
762 : 6 : if (miss_positions > max_allowed_miss_positions)
763 : : return false;
764 : : }
765 : : }
766 : : return true;
767 : : }
768 : :
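: : /* A worked example with illustrative inputs: cache_line_size = 64,
: : step = 96, delta = 32, distinct_iters = 2 (the reduced prefetch block
: : for this step) and align_unit = 4. Then total_positions = (64 / 4) * 2
: : = 32 and at most 50 * 32 / 1000 = 1 miss position is allowed. Already
: : for alignments 0 and 4 the second iteration places the two addresses
: : (96, 128) resp. (100, 132) in different cache lines, so the limit is
: : exceeded and the function returns false. */
: :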
769 : : /* Prune the prefetch candidate REF using the reuse with BY.
770 : : If BY_IS_BEFORE is true, BY is before REF in the loop. */
771 : :
772 : : static void
773 : 88 : prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
774 : : bool by_is_before)
775 : : {
776 : 88 : HOST_WIDE_INT step;
777 : 88 : bool backward;
778 : 88 : HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
779 : 88 : HOST_WIDE_INT delta = delta_b - delta_r;
780 : 88 : HOST_WIDE_INT hit_from;
781 : 88 : unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
782 : 88 : HOST_WIDE_INT reduced_step;
783 : 88 : unsigned HOST_WIDE_INT reduced_prefetch_block;
784 : 88 : tree ref_type;
785 : 88 : int align_unit;
786 : :
787 : : /* If the step is non-constant, we cannot calculate prefetch_before. */
788 : 88 : if (!cst_and_fits_in_hwi (ref->group->step)) {
789 : : return;
790 : : }
791 : :
792 : 86 : step = int_cst_value (ref->group->step);
793 : :
794 : 86 : backward = step < 0;
795 : :
796 : :
797 : 86 : if (delta == 0)
798 : : {
799 : : /* If the references have the same address, only prefetch the
800 : : former. */
801 : 0 : if (by_is_before)
802 : 0 : ref->prefetch_before = 0;
803 : :
804 : 0 : return;
805 : : }
806 : :
807 : 86 : if (!step)
808 : : {
809 : : /* If the reference addresses are invariant and fall into the
810 : : same cache line, prefetch just the first one. */
811 : 6 : if (!by_is_before)
812 : : return;
813 : :
814 : 6 : if (ddown (ref->delta, PREFETCH_BLOCK)
815 : 3 : != ddown (by->delta, PREFETCH_BLOCK))
816 : : return;
817 : :
818 : 3 : ref->prefetch_before = 0;
819 : 3 : return;
820 : : }
821 : :
822 : : /* Only prune the reference that is behind in the array. */
823 : 80 : if (backward)
824 : : {
825 : 0 : if (delta > 0)
826 : : return;
827 : :
828 : : /* Transform the data so that we may assume that the accesses
829 : : are forward. */
830 : 0 : delta = - delta;
831 : 0 : step = -step;
832 : 0 : delta_r = PREFETCH_BLOCK - 1 - delta_r;
833 : 0 : delta_b = PREFETCH_BLOCK - 1 - delta_b;
834 : : }
835 : : else
836 : : {
837 : 80 : if (delta < 0)
838 : : return;
839 : : }
840 : :
841 : : /* Check whether the two references are likely to hit the same cache
842 : : line, and how far apart the iterations in which that occurs are from
843 : : each other. */
844 : :
845 : 40 : if (step <= PREFETCH_BLOCK)
846 : : {
847 : : /* The accesses are sure to meet. Let us check when. */
848 : 34 : hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
849 : 34 : prefetch_before = (hit_from - delta_r + step - 1) / step;
850 : :
851 : : /* Do not reduce prefetch_before if we meet beyond cache size. */
852 : 34 : if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
853 : : prefetch_before = PREFETCH_ALL;
854 : 34 : if (prefetch_before < ref->prefetch_before)
855 : 25 : ref->prefetch_before = prefetch_before;
856 : :
857 : 34 : return;
858 : : }
859 : :
860 : : /* A more complicated case with step > prefetch_block. First reduce
861 : : the ratio between the step and the cache line size to its simplest
862 : : terms. The resulting denominator will then represent the number of
863 : : distinct iterations after which each address will go back to its
864 : : initial location within the cache line. This computation assumes
865 : : that PREFETCH_BLOCK is a power of two. */
866 : 6 : prefetch_block = PREFETCH_BLOCK;
867 : 6 : reduced_prefetch_block = prefetch_block;
868 : 6 : reduced_step = step;
869 : 6 : while ((reduced_step & 1) == 0
870 : 36 : && reduced_prefetch_block > 1)
871 : : {
872 : 30 : reduced_step >>= 1;
873 : 30 : reduced_prefetch_block >>= 1;
874 : : }
875 : :
876 : 6 : prefetch_before = delta / step;
877 : 6 : delta %= step;
878 : 6 : ref_type = TREE_TYPE (ref->mem);
879 : 6 : align_unit = TYPE_ALIGN (ref_type) / 8;
880 : 6 : if (is_miss_rate_acceptable (prefetch_block, step, delta,
881 : : reduced_prefetch_block, align_unit))
882 : : {
883 : : /* Do not reduce prefetch_before if we meet beyond cache size. */
884 : 0 : if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
885 : : prefetch_before = PREFETCH_ALL;
886 : 0 : if (prefetch_before < ref->prefetch_before)
887 : 0 : ref->prefetch_before = prefetch_before;
888 : :
889 : 0 : return;
890 : : }
891 : :
892 : : /* Try also the following iteration. */
893 : 6 : prefetch_before++;
894 : 6 : delta = step - delta;
895 : 6 : if (is_miss_rate_acceptable (prefetch_block, step, delta,
896 : : reduced_prefetch_block, align_unit))
897 : : {
898 : 0 : if (prefetch_before < ref->prefetch_before)
899 : 0 : ref->prefetch_before = prefetch_before;
900 : :
901 : 0 : return;
902 : : }
903 : :
904 : : /* The ref probably does not reuse BY. */
905 : : return;
906 : : }
907 : :
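: : /* Worked example, matching references (1) and (2) from the comment at
: : the top of the file (char accesses, PREFETCH_BLOCK = 64): pruning
: : a[i] by a[i + 64] gives step = 1, delta_r = 0, delta_b = 64, so
: : hit_from = 64 and prefetch_before = (64 - 0 + 1 - 1) / 1 = 64, i.e.
: : a[i] is only worth prefetching in the first 64 iterations. */
: :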
908 : : /* Prune the prefetch candidate REF using the reuses with other references
909 : : in REFS. */
910 : :
911 : : static void
912 : 186 : prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
913 : : {
914 : 186 : struct mem_ref *prune_by;
915 : 186 : bool before = true;
916 : :
917 : 186 : prune_ref_by_self_reuse (ref);
918 : :
919 : 646 : for (prune_by = refs; prune_by; prune_by = prune_by->next)
920 : : {
921 : 274 : if (prune_by == ref)
922 : : {
923 : 186 : before = false;
924 : 186 : continue;
925 : : }
926 : :
927 : 88 : if (!WRITE_CAN_USE_READ_PREFETCH
928 : : && ref->write_p
929 : : && !prune_by->write_p)
930 : : continue;
931 : 88 : if (!READ_CAN_USE_WRITE_PREFETCH
932 : 88 : && !ref->write_p
933 : 62 : && prune_by->write_p)
934 : 0 : continue;
935 : :
936 : 88 : prune_ref_by_group_reuse (ref, prune_by, before);
937 : : }
938 : 186 : }
939 : :
940 : : /* Prune the prefetch candidates in GROUP using the reuse analysis. */
941 : :
942 : : static void
943 : 152 : prune_group_by_reuse (struct mem_ref_group *group)
944 : : {
945 : 152 : struct mem_ref *ref_pruned;
946 : :
947 : 338 : for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
948 : : {
949 : 186 : prune_ref_by_reuse (ref_pruned, group->refs);
950 : :
951 : 186 : if (dump_file && (dump_flags & TDF_DETAILS))
952 : : {
953 : 27 : dump_mem_ref (dump_file, ref_pruned);
954 : :
955 : 27 : if (ref_pruned->prefetch_before == PREFETCH_ALL
956 : 26 : && ref_pruned->prefetch_mod == 1)
957 : 5 : fprintf (dump_file, " no restrictions");
958 : 22 : else if (ref_pruned->prefetch_before == 0)
959 : 1 : fprintf (dump_file, " do not prefetch");
960 : 21 : else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
961 : 0 : fprintf (dump_file, " prefetch once");
962 : : else
963 : : {
964 : 21 : if (ref_pruned->prefetch_before != PREFETCH_ALL)
965 : : {
966 : 0 : fprintf (dump_file, " prefetch before ");
967 : 0 : fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
968 : : ref_pruned->prefetch_before);
969 : : }
970 : 21 : if (ref_pruned->prefetch_mod != 1)
971 : : {
972 : 21 : fprintf (dump_file, " prefetch mod ");
973 : 21 : fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
974 : : ref_pruned->prefetch_mod);
975 : : }
976 : : }
977 : 27 : fprintf (dump_file, "\n");
978 : : }
979 : : }
980 : 152 : }
981 : :
982 : : /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
983 : :
984 : : static void
985 : 0 : prune_by_reuse (struct mem_ref_group *groups)
986 : : {
987 : 229 : for (; groups; groups = groups->next)
988 : 152 : prune_group_by_reuse (groups);
989 : 0 : }
990 : :
991 : : /* Returns true if we should issue prefetch for REF. */
992 : :
993 : : static bool
994 : 592 : should_issue_prefetch_p (struct mem_ref *ref)
995 : : {
996 : : /* Do we want to issue prefetches for non-constant strides? */
997 : 592 : if (!cst_and_fits_in_hwi (ref->group->step)
998 : 592 : && param_prefetch_dynamic_strides == 0)
999 : : {
1000 : 0 : if (dump_file && (dump_flags & TDF_DETAILS))
1001 : 0 : fprintf (dump_file,
1002 : : "Skipping non-constant step for reference %u:%u\n",
1003 : 0 : ref->group->uid, ref->uid);
1004 : 0 : return false;
1005 : : }
1006 : :
1007 : : /* Some processors may have a hardware prefetcher that may conflict with
1008 : : prefetch hints for a range of strides. Make sure we don't issue
1009 : : prefetches for such cases if the stride is within this particular
1010 : : range. */
1011 : 592 : if (cst_and_fits_in_hwi (ref->group->step)
1012 : 592 : && abs_hwi (int_cst_value (ref->group->step))
1013 : 441 : < (HOST_WIDE_INT) param_prefetch_minimum_stride)
1014 : : {
1015 : 0 : if (dump_file && (dump_flags & TDF_DETAILS))
1016 : 0 : fprintf (dump_file,
1017 : : "Step for reference %u:%u (" HOST_WIDE_INT_PRINT_DEC
1018 : : ") is less than the minimum required stride of %d\n",
1019 : 0 : ref->group->uid, ref->uid, int_cst_value (ref->group->step),
1020 : : param_prefetch_minimum_stride);
1021 : 0 : return false;
1022 : : }
1023 : :
1024 : : /* For now do not issue prefetches that are useful only for the first
1025 : : few iterations. */
1026 : 592 : if (ref->prefetch_before != PREFETCH_ALL)
1027 : : {
1028 : 89 : if (dump_file && (dump_flags & TDF_DETAILS))
1029 : 4 : fprintf (dump_file, "Ignoring reference %u:%u due to prefetch_before\n",
1030 : 4 : ref->group->uid, ref->uid);
1031 : 89 : return false;
1032 : : }
1033 : :
1034 : : /* Do not prefetch nontemporal stores. */
1035 : 503 : if (ref->storent_p)
1036 : : {
1037 : 2 : if (dump_file && (dump_flags & TDF_DETAILS))
1038 : 2 : fprintf (dump_file, "Ignoring nontemporal store reference %u:%u\n", ref->group->uid, ref->uid);
1039 : 2 : return false;
1040 : : }
1041 : :
1042 : : return true;
1043 : : }
1044 : :
1045 : : /* Decide which of the prefetch candidates in GROUPS to prefetch.
1046 : : AHEAD is the number of iterations to prefetch ahead (which corresponds
1047 : : to the number of simultaneous instances of one prefetch running at a
1048 : : time). UNROLL_FACTOR is the factor by which the loop is going to be
1049 : : unrolled. Returns true if there is anything to prefetch. */
1050 : :
1051 : : static bool
1052 : 64 : schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
1053 : : unsigned ahead)
1054 : : {
1055 : 64 : unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
1056 : 64 : unsigned slots_per_prefetch;
1057 : 64 : struct mem_ref *ref;
1058 : 64 : bool any = false;
1059 : :
1060 : : /* At most param_simultaneous_prefetches should be running
1061 : : at the same time. */
1062 : 64 : remaining_prefetch_slots = param_simultaneous_prefetches;
1063 : :
1064 : : /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1065 : : AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
1066 : : it will need a prefetch slot. */
1067 : 64 : slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
1068 : 64 : if (dump_file && (dump_flags & TDF_DETAILS))
1069 : 15 : fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
1070 : : slots_per_prefetch);
1071 : :
1072 : : /* For now we just take memory references one by one and issue
1073 : : prefetches for as many as possible. The groups are sorted
1074 : : starting with the largest step, since the references with
1075 : : large step are more likely to cause many cache misses. */
1076 : :
1077 : 193 : for (; groups; groups = groups->next)
1078 : 286 : for (ref = groups->refs; ref; ref = ref->next)
1079 : : {
1080 : 157 : if (!should_issue_prefetch_p (ref))
1081 : 27 : continue;
1082 : :
1083 : : /* The loop is far from being sufficiently unrolled for this
1084 : : prefetch. Do not generate the prefetch to avoid many redundant
1085 : : prefetches. */
1086 : 130 : if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1087 : 3 : continue;
1088 : :
1089 : : /* If we need to prefetch the reference each PREFETCH_MOD iterations,
1090 : : and we unroll the loop UNROLL_FACTOR times, we need to insert
1091 : : ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1092 : : iteration. */
1093 : 127 : n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1094 : 127 : / ref->prefetch_mod);
1095 : 127 : prefetch_slots = n_prefetches * slots_per_prefetch;
1096 : :
1097 : : /* If more than half of the prefetches would be lost anyway, do not
1098 : : issue the prefetch. */
1099 : 127 : if (2 * remaining_prefetch_slots < prefetch_slots)
1100 : 2 : continue;
1101 : :
1102 : : /* Stop prefetching if debug counter is activated. */
1103 : 125 : if (!dbg_cnt (prefetch))
1104 : 0 : continue;
1105 : :
1106 : 125 : ref->issue_prefetch_p = true;
1107 : 125 : if (dump_file && (dump_flags & TDF_DETAILS))
1108 : 20 : fprintf (dump_file, "Decided to issue prefetch for reference %u:%u\n",
1109 : 20 : ref->group->uid, ref->uid);
1110 : :
1111 : 125 : if (remaining_prefetch_slots <= prefetch_slots)
1112 : : return true;
1113 : 123 : remaining_prefetch_slots -= prefetch_slots;
1114 : 123 : any = true;
1115 : : }
1116 : :
1117 : : return any;
1118 : : }
1119 : :
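: : /* For illustration, assuming param_simultaneous_prefetches = 6 (an
: : example value; this parameter is target-dependent), ahead = 4 and
: : unroll_factor = 4: slots_per_prefetch = (4 + 2) / 4 = 1. A reference
: : with prefetch_mod = 1 needs n_prefetches = 4, i.e. 4 slots, and is
: : issued, leaving 2 slots; a second reference with prefetch_mod = 2
: : needs 2 prefetches (2 slots), is issued as well, and scheduling stops
: : because all slots are used. */
: :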
1120 : : /* Return TRUE if no prefetch is going to be generated in the given
1121 : : GROUPS. */
1122 : :
1123 : : static bool
1124 : 77 : nothing_to_prefetch_p (struct mem_ref_group *groups)
1125 : : {
1126 : 77 : struct mem_ref *ref;
1127 : :
1128 : 78 : for (; groups; groups = groups->next)
1129 : 85 : for (ref = groups->refs; ref; ref = ref->next)
1130 : 84 : if (should_issue_prefetch_p (ref))
1131 : : return false;
1132 : :
1133 : : return true;
1134 : : }
1135 : :
1136 : : /* Estimate the number of prefetches in the given GROUPS.
1137 : : UNROLL_FACTOR is the factor by which LOOP was unrolled. */
1138 : :
1139 : : static int
1140 : 72 : estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1141 : : {
1142 : 72 : struct mem_ref *ref;
1143 : 72 : unsigned n_prefetches;
1144 : 72 : int prefetch_count = 0;
1145 : :
1146 : 223 : for (; groups; groups = groups->next)
1147 : 334 : for (ref = groups->refs; ref; ref = ref->next)
1148 : 183 : if (should_issue_prefetch_p (ref))
1149 : : {
1150 : 157 : n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1151 : 157 : / ref->prefetch_mod);
1152 : 157 : prefetch_count += n_prefetches;
1153 : : }
1154 : :
1155 : 72 : return prefetch_count;
1156 : : }
1157 : :
1158 : : /* Issue prefetches for the reference REF into the loop as decided before.
1159 : : AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
1160 : : is the factor by which LOOP was unrolled. */
1161 : :
1162 : : static void
1163 : 125 : issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1164 : : {
1165 : 125 : HOST_WIDE_INT delta;
1166 : 125 : tree addr, addr_base, write_p, local, forward;
1167 : 125 : gcall *prefetch;
1168 : 125 : gimple_stmt_iterator bsi;
1169 : 125 : unsigned n_prefetches, ap;
1170 : 125 : bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1171 : :
1172 : 125 : if (dump_file && (dump_flags & TDF_DETAILS))
1173 : 20 : fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n",
1174 : : nontemporal ? " nontemporal" : "",
1175 : 20 : ref->group->uid, ref->uid);
1176 : :
1177 : 125 : bsi = gsi_for_stmt (ref->stmt);
1178 : :
1179 : 125 : n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1180 : 125 : / ref->prefetch_mod);
1181 : 125 : addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1182 : 125 : addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1183 : : true, NULL, true, GSI_SAME_STMT);
1184 : 125 : write_p = ref->write_p ? integer_one_node : integer_zero_node;
1185 : 125 : local = nontemporal ? integer_zero_node : build_int_cst (integer_type_node, 3);
1186 : :
1187 : 250 : for (ap = 0; ap < n_prefetches; ap++)
1188 : : {
1189 : 125 : if (cst_and_fits_in_hwi (ref->group->step))
1190 : : {
1191 : : /* Determine the address to prefetch. */
1192 : 160 : delta = (ahead + ap * ref->prefetch_mod) *
1193 : 80 : int_cst_value (ref->group->step);
1194 : 80 : addr = fold_build_pointer_plus_hwi (addr_base, delta);
1195 : 80 : addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1196 : : NULL, true, GSI_SAME_STMT);
1197 : : }
1198 : : else
1199 : : {
1200 : : /* The step size is non-constant but loop-invariant. We use the
1201 : : heuristic of simply prefetching AHEAD iterations ahead. */
1202 : 45 : forward = fold_build2 (MULT_EXPR, sizetype,
1203 : : fold_convert (sizetype, ref->group->step),
1204 : : fold_convert (sizetype, size_int (ahead)));
1205 : 45 : addr = fold_build_pointer_plus (addr_base, forward);
1206 : 45 : addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1207 : : NULL, true, GSI_SAME_STMT);
1208 : : }
1209 : :
1210 : 125 : if (addr_base != addr
1211 : 123 : && TREE_CODE (addr_base) == SSA_NAME
1212 : 123 : && TREE_CODE (addr) == SSA_NAME)
1213 : : {
1214 : 123 : duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
1215 : : /* As this isn't a plain copy we have to reset alignment
1216 : : information. */
1217 : 123 : if (SSA_NAME_PTR_INFO (addr))
1218 : 68 : mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
1219 : : }
1220 : :
1221 : : /* Create the prefetch instruction. */
1222 : 125 : prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1223 : : 3, addr, write_p, local);
1224 : 125 : gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1225 : : }
1226 : 125 : }
1227 : :
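: : /* Schematically, for a read of a[i] with constant step 4, prefetch_mod
: : 1, unroll_factor 1 and ahead = 10 (illustrative values), the code
: : inserted before the statement looks roughly like
: :
: : addr_base = &a[i];
: : addr = addr_base + 40;
: : __builtin_prefetch (addr, 0, 3);
: :
: : where the second argument is 1 for writes and the third argument
: : drops from 3 to 0 when the reference is considered nontemporal. */
: :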
1228 : : /* Issue prefetches for the references in GROUPS into the loop as decided
1229 : : before. AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
1230 : : is the factor by which LOOP was unrolled. */
1231 : :
1232 : : static void
1233 : 62 : issue_prefetches (struct mem_ref_group *groups,
1234 : : unsigned unroll_factor, unsigned ahead)
1235 : : {
1236 : 62 : struct mem_ref *ref;
1237 : :
1238 : 195 : for (; groups; groups = groups->next)
1239 : 294 : for (ref = groups->refs; ref; ref = ref->next)
1240 : 161 : if (ref->issue_prefetch_p)
1241 : 125 : issue_prefetch_ref (ref, unroll_factor, ahead);
1242 : 62 : }
1243 : : /* Returns true if REF is a memory write for which a nontemporal store insn
1244 : : /* Returns true if REF is a memory write for that a nontemporal store insn
1245 : : can be used. */
1246 : :
1247 : : static bool
1248 : 151 : nontemporal_store_p (struct mem_ref *ref)
1249 : : {
1250 : 151 : machine_mode mode;
1251 : 151 : enum insn_code code;
1252 : :
1253 : : /* REF must be a write that is not reused. We require it to be independent
1254 : : of all other memory references in the loop, as the nontemporal stores may
1255 : : be reordered with respect to other memory references. */
1256 : 151 : if (!ref->write_p
1257 : 151 : || !ref->independent_p
1258 : 19 : || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1259 : : return false;
1260 : :
1261 : : /* Check that we have the storent instruction for the mode. */
1262 : 2 : mode = TYPE_MODE (TREE_TYPE (ref->mem));
1263 : 2 : if (mode == BLKmode)
1264 : : return false;
1265 : :
1266 : 2 : code = optab_handler (storent_optab, mode);
1267 : 2 : return code != CODE_FOR_nothing;
1268 : : }
1269 : :
1270 : : /* If REF is a nontemporal store, we mark the corresponding modify statement
1271 : : and return true. Otherwise, we return false. */
1272 : :
1273 : : static bool
1274 : 151 : mark_nontemporal_store (struct mem_ref *ref)
1275 : : {
1276 : 151 : if (!nontemporal_store_p (ref))
1277 : : return false;
1278 : :
1279 : 2 : if (dump_file && (dump_flags & TDF_DETAILS))
1280 : 2 : fprintf (dump_file, "Marked reference %u:%u as a nontemporal store.\n",
1281 : 2 : ref->group->uid, ref->uid);
1282 : :
1283 : 2 : gimple_assign_set_nontemporal_move (ref->stmt, true);
1284 : 2 : ref->storent_p = true;
1285 : :
1286 : 2 : return true;
1287 : : }
1288 : :
1289 : : /* Issue a memory fence instruction after LOOP. */
1290 : :
1291 : : static void
1292 : 2 : emit_mfence_after_loop (class loop *loop)
1293 : : {
1294 : 2 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1295 : 2 : edge exit;
1296 : 2 : gcall *call;
1297 : 2 : gimple_stmt_iterator bsi;
1298 : 2 : unsigned i;
1299 : :
1300 : 8 : FOR_EACH_VEC_ELT (exits, i, exit)
1301 : : {
1302 : 2 : call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1303 : :
1304 : 2 : if (!single_pred_p (exit->dest)
1305 : : /* If possible, we prefer not to insert the fence on other paths
1306 : : in cfg. */
1307 : 2 : && !(exit->flags & EDGE_ABNORMAL))
1308 : 0 : split_loop_exit_edge (exit);
1309 : 2 : bsi = gsi_after_labels (exit->dest);
1310 : :
1311 : 2 : gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1312 : : }
1313 : 2 : }
1314 : :
1315 : : /* Returns true if we can use storent in loop, false otherwise. */
1316 : :
1317 : : static bool
1318 : 64 : may_use_storent_in_loop_p (class loop *loop)
1319 : : {
1320 : 64 : bool ret = true;
1321 : :
1322 : 64 : if (loop->inner != NULL)
1323 : : return false;
1324 : :
1325 : : /* If we must issue a mfence insn after using storent, check that there
1326 : : is a suitable place for it at each of the loop exits. */
1327 : 62 : if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1328 : : {
1329 : 62 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1330 : 62 : unsigned i;
1331 : 62 : edge exit;
1332 : :
1333 : 207 : FOR_EACH_VEC_ELT (exits, i, exit)
1334 : 42 : if ((exit->flags & EDGE_ABNORMAL)
1335 : 0 : && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1336 : 42 : ret = false;
1337 : 62 : }
1338 : :
1339 : : return ret;
1340 : : }
1341 : :
1342 : : /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1343 : : references in the loop. Returns whether we inserted any mfence call. */
1344 : :
1345 : : static bool
1346 : 64 : mark_nontemporal_stores (class loop *loop, struct mem_ref_group *groups)
1347 : : {
1348 : 64 : struct mem_ref *ref;
1349 : 64 : bool any = false;
1350 : :
1351 : 64 : if (!may_use_storent_in_loop_p (loop))
1352 : : return false;
1353 : :
1354 : 187 : for (; groups; groups = groups->next)
1355 : 276 : for (ref = groups->refs; ref; ref = ref->next)
1356 : 151 : any |= mark_nontemporal_store (ref);
1357 : :
1358 : 62 : if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1359 : : {
1360 : 2 : emit_mfence_after_loop (loop);
1361 : 2 : return true;
1362 : : }
1363 : : return false;
1364 : : }
1365 : :
1366 : : /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1367 : : this is the case, fills in DESC with the description of the number of
1368 : : iterations. */
1369 : :
1370 : : static bool
1371 : 57 : should_unroll_loop_p (class loop *loop, class tree_niter_desc *desc,
1372 : : unsigned factor)
1373 : : {
1374 : 57 : if (!can_unroll_loop_p (loop, factor, desc))
1375 : : return false;
1376 : :
1377 : : /* We only consider loops without control flow for unrolling. This is not
1378 : : a hard restriction -- tree_unroll_loop works with arbitrary loops
1379 : : as well; but the unrolling/prefetching is usually more profitable for
1380 : : loops consisting of a single basic block, and we want to limit the
1381 : : code growth. */
1382 : 47 : if (loop->num_nodes > 2)
1383 : 3 : return false;
1384 : :
1385 : : return true;
1386 : : }
1387 : :
1388 : : /* Determine the factor by which to unroll LOOP, from the information
1389 : : contained in the list of memory references REFS. The description of the
1390 : : number of iterations of LOOP is stored to DESC. NINSNS is the number of
1391 : : insns of the LOOP. EST_NITER is the estimated number of iterations of
1392 : : the loop, or -1 if no estimate is available. */
1393 : :
1394 : : static unsigned
1395 : 72 : determine_unroll_factor (class loop *loop, struct mem_ref_group *refs,
1396 : : unsigned ninsns, class tree_niter_desc *desc,
1397 : : HOST_WIDE_INT est_niter)
1398 : : {
1399 : 72 : unsigned upper_bound;
1400 : 72 : unsigned nfactor, factor, mod_constraint;
1401 : 72 : struct mem_ref_group *agp;
1402 : 72 : struct mem_ref *ref;
1403 : :
1404 : : /* Bail out early in case we must not unroll loops. */
1405 : 72 : if (!flag_unroll_loops)
1406 : : return 1;
1407 : :
1408 : : /* First check whether the loop is not too large to unroll. We ignore
1409 : : PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1410 : : from unrolling them enough to make exactly one cache line covered by each
1411 : : iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1412 : : us from unrolling the loops too many times in cases where we only expect
1413 : : gains from better scheduling and decreasing loop overhead, which is not
1414 : : the case here. */
1415 : 71 : upper_bound = param_max_unrolled_insns / ninsns;
1416 : :
1417 : : /* If we unrolled the loop more times than it iterates, the unrolled version
1418 : : of the loop would never be entered. */
1419 : 71 : if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1420 : 0 : upper_bound = est_niter;
1421 : :
1422 : 71 : if (upper_bound <= 1)
1423 : : return 1;
1424 : :
1425 : : /* Choose the factor so that we may prefetch each cache line just once,
1426 : : but bound the unrolling by UPPER_BOUND. */
1427 : : factor = 1;
1428 : 193 : for (agp = refs; agp; agp = agp->next)
1429 : 304 : for (ref = agp->refs; ref; ref = ref->next)
1430 : 168 : if (should_issue_prefetch_p (ref))
1431 : : {
1432 : 142 : mod_constraint = ref->prefetch_mod;
1433 : 142 : nfactor = least_common_multiple (mod_constraint, factor);
1434 : 142 : if (nfactor <= upper_bound)
1435 : 168 : factor = nfactor;
1436 : : }
1437 : :
1438 : 57 : if (!should_unroll_loop_p (loop, desc, factor))
1439 : : return 1;
1440 : :
1441 : : return factor;
1442 : : }
1443 : :
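: : /* Worked example with illustrative numbers: with
: : param_max_unrolled_insns = 200 and ninsns = 10, upper_bound is 20.
: : Two prefetched references with prefetch_mod 4 and 16 then give
: : factor = lcm (1, 4) = 4 and factor = lcm (4, 16) = 16 <= 20, so the
: : loop is unrolled 16 times (provided should_unroll_loop_p agrees). If
: : the estimated trip count were only 8, upper_bound would be lowered to
: : 8 and the factor would stay at 4. */
: :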
1444 : : /* Returns the total volume of the memory references REFS, taking into account
1445 : : reuses in the innermost loop and cache line size. TODO -- we should also
1446 : : take into account reuses across the iterations of the loops in the loop
1447 : : nest. */
1448 : :
1449 : : static unsigned
1450 : 70 : volume_of_references (struct mem_ref_group *refs)
1451 : : {
1452 : 70 : unsigned volume = 0;
1453 : 70 : struct mem_ref_group *gr;
1454 : 70 : struct mem_ref *ref;
1455 : :
1456 : 211 : for (gr = refs; gr; gr = gr->next)
1457 : 312 : for (ref = gr->refs; ref; ref = ref->next)
1458 : : {
1459 : : /* Almost always reuses another value? */
1460 : 171 : if (ref->prefetch_before != PREFETCH_ALL)
1461 : 26 : continue;
1462 : :
1463 : : /* If several iterations access the same cache line, use the size of
1464 : : the line divided by this number. Otherwise, a cache line is
1465 : : accessed in each iteration. TODO -- in the latter case, we should
1466 : : take the size of the reference into account, rounding it up on cache
1467 : : line size multiple. */
1468 : 145 : volume += param_l1_cache_line_size / ref->prefetch_mod;
1469 : : }
1470 : 70 : return volume;
1471 : : }
1472 : :
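: : /* For example, with 64-byte cache lines, a group with one reference of
: : prefetch_mod 1 and one of prefetch_mod 16 contributes
: : 64 / 1 + 64 / 16 = 68 bytes per iteration; references whose
: : prefetch_before was reduced below PREFETCH_ALL are assumed to reuse
: : other data and contribute nothing. */
: :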
1473 : : /* Returns the volume of memory references accessed across VEC iterations of
1474 : : loops, whose sizes are described in the LOOP_SIZES array. N is the number
1475 : : of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1476 : :
1477 : : static unsigned
1478 : 178 : volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1479 : : {
1480 : 178 : unsigned i;
1481 : :
1482 : 372 : for (i = 0; i < n; i++)
1483 : 206 : if (vec[i] != 0)
1484 : : break;
1485 : :
1486 : 178 : if (i == n)
1487 : : return 0;
1488 : :
1489 : 12 : gcc_assert (vec[i] > 0);
1490 : :
1491 : : /* We ignore the parts of the distance vector in subloops, since usually
1492 : : the numbers of iterations are much smaller. */
1493 : 12 : return loop_sizes[i] * vec[i];
1494 : : }
1495 : :
1496 : : /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
1497 : : at the position corresponding to the loop of the step. N is the depth
1498 : : of the considered loop nest, and LOOP is its innermost loop. */
1499 : :
1500 : : static void
1501 : 179 : add_subscript_strides (tree access_fn, unsigned stride,
1502 : : HOST_WIDE_INT *strides, unsigned n, class loop *loop)
1503 : : {
1504 : 179 : class loop *aloop;
1505 : 179 : tree step;
1506 : 179 : HOST_WIDE_INT astep;
1507 : 179 : unsigned min_depth = loop_depth (loop) - n;
1508 : :
1509 : 367 : while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1510 : : {
1511 : 188 : aloop = get_chrec_loop (access_fn);
1512 : 188 : step = CHREC_RIGHT (access_fn);
1513 : 188 : access_fn = CHREC_LEFT (access_fn);
1514 : :
1515 : 188 : if ((unsigned) loop_depth (aloop) <= min_depth)
1516 : 0 : continue;
1517 : :
1518 : 188 : if (tree_fits_shwi_p (step))
1519 : 139 : astep = tree_to_shwi (step);
1520 : : else
1521 : 49 : astep = param_l1_cache_line_size;
1522 : :
1523 : 188 : strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1524 : :
1525 : : }
1526 : 179 : }
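 : :
 : : /* A sketch with a made-up access function: for {{0, +, 4}_1, +, 400}_2
 : :    and STRIDE 1, the walk above records a byte stride of 400 at the
 : :    position of loop 2 and of 4 at the position of loop 1 in STRIDES.
 : :    A step that does not fit a signed HOST_WIDE_INT is conservatively
 : :    replaced by param_l1_cache_line_size.  */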
1527 : :
1528 : : /* Returns the volume of memory references accessed between two consecutive
1529 : : self-reuses of the reference DR. We consider the subscripts of DR in N
1530 : : loops, and LOOP_SIZES contains the volumes of accesses in each of the
1531 : : loops. LOOP is the innermost loop of the current loop nest. */
1532 : :
1533 : : static unsigned
1534 : 171 : self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1535 : : class loop *loop)
1536 : : {
1537 : 171 : tree stride, access_fn;
1538 : 171 : HOST_WIDE_INT *strides, astride;
1539 : 171 : vec<tree> access_fns;
1540 : 171 : tree ref = DR_REF (dr);
1541 : 171 : unsigned i, ret = ~0u;
1542 : :
1543 : : /* In the following example:
1544 : :
1545 : : for (i = 0; i < N; i++)
1546 : : for (j = 0; j < N; j++)
1547 : : use (a[j][i]);
1548 : : the same cache line is accessed every N steps (except if the change from
1549 : : i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1550 : : we cannot rely purely on the results of the data dependence analysis.
1551 : :
1552 : : Instead, we compute the stride of the reference in each loop, and consider
1553 : : the innermost loop in which the stride is less than the cache line size. */
1554 : :
1555 : 171 : strides = XCNEWVEC (HOST_WIDE_INT, n);
1556 : 171 : access_fns = DR_ACCESS_FNS (dr);
1557 : :
1558 : 698 : FOR_EACH_VEC_ELT (access_fns, i, access_fn)
1559 : : {
1560 : : /* Keep track of the reference corresponding to the subscript, so that we
1561 : : know its stride. */
1562 : 181 : while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1563 : 2 : ref = TREE_OPERAND (ref, 0);
1564 : :
1565 : 179 : if (TREE_CODE (ref) == ARRAY_REF)
1566 : : {
1567 : 47 : stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1568 : 47 : if (tree_fits_uhwi_p (stride))
1569 : 47 : astride = tree_to_uhwi (stride);
1570 : : else
1571 : 0 : astride = param_l1_cache_line_size;
1572 : :
1573 : 47 : ref = TREE_OPERAND (ref, 0);
1574 : : }
1575 : : else
1576 : : astride = 1;
1577 : :
1578 : 179 : add_subscript_strides (access_fn, astride, strides, n, loop);
1579 : : }
1580 : :
1581 : 369 : for (i = n; i-- > 0; )
1582 : : {
1583 : 207 : unsigned HOST_WIDE_INT s;
1584 : :
1585 : 207 : s = strides[i] < 0 ? -strides[i] : strides[i];
1586 : :
1587 : 207 : if (s < (unsigned) param_l1_cache_line_size
1588 : 130 : && (loop_sizes[i]
1589 : 130 : > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1590 : : {
1591 : : ret = loop_sizes[i];
1592 : : break;
1593 : : }
1594 : : }
1595 : :
1596 : 171 : free (strides);
1597 : 171 : return ret;
1598 : : }
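 : :
 : : /* Continuing the a[j][i] example from the comment above, assuming
 : :    4-byte elements and rows of 4 * N bytes: the stride of the inner j
 : :    loop is 4 * N (usually larger than a cache line), while the stride
 : :    of the outer i loop is only 4 bytes.  The scan above therefore
 : :    skips the j loop and, provided one iteration of the i loop touches
 : :    more than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION bytes, returns
 : :    the LOOP_SIZES entry of the i loop, i.e. roughly the volume of one
 : :    full execution of the inner loop, as the self-reuse distance.  */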
1599 : :
1600 : : /* Determines the distance till the first reuse of each reference in REFS
1601 : : in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1602 : : memory references in the loop. Return false if the analysis fails. */
1603 : :
1604 : : static bool
1605 : 72 : determine_loop_nest_reuse (class loop *loop, struct mem_ref_group *refs,
1606 : : bool no_other_refs)
1607 : : {
1608 : 72 : class loop *nest, *aloop;
1609 : 72 : vec<data_reference_p> datarefs = vNULL;
1610 : 72 : vec<ddr_p> dependences = vNULL;
1611 : 72 : struct mem_ref_group *gr;
1612 : 72 : struct mem_ref *ref, *refb;
1613 : 72 : auto_vec<loop_p> vloops;
1614 : 72 : unsigned *loop_data_size;
1615 : 72 : unsigned i, j, n;
1616 : 72 : unsigned volume, dist, adist;
1617 : 72 : HOST_WIDE_INT vol;
1618 : 72 : data_reference_p dr;
1619 : 72 : ddr_p dep;
1620 : :
1621 : 72 : if (loop->inner)
1622 : : return true;
1623 : :
1624 : : /* Find the outermost loop of the loop nest of LOOP (we require that
1625 : : there are no sibling loops inside the nest). */
1626 : : nest = loop;
1627 : 80 : while (1)
1628 : : {
1629 : 80 : aloop = loop_outer (nest);
1630 : :
1631 : 80 : if (aloop == current_loops->tree_root
1632 : 25 : || aloop->inner->next)
1633 : : break;
1634 : :
1635 : : nest = aloop;
1636 : : }
1637 : :
1638 : : /* For each loop, determine the amount of data accessed in each iteration.
1639 : : We use this to estimate whether the reference is evicted from the
1640 : : cache before its reuse. */
1641 : 70 : find_loop_nest (nest, &vloops);
1642 : 70 : n = vloops.length ();
1643 : 70 : loop_data_size = XNEWVEC (unsigned, n);
1644 : 70 : volume = volume_of_references (refs);
1645 : 70 : i = n;
1646 : 150 : while (i-- != 0)
1647 : : {
1648 : 80 : loop_data_size[i] = volume;
1649 : : /* Bound the volume by the L2 cache size, since above this bound,
1650 : : all dependence distances are equivalent. */
1651 : 80 : if (volume > L2_CACHE_SIZE_BYTES)
1652 : 0 : continue;
1653 : :
1654 : 80 : aloop = vloops[i];
1655 : 80 : vol = estimated_stmt_executions_int (aloop);
1656 : 80 : if (vol == -1)
1657 : 37 : vol = expected_loop_iterations (aloop);
1658 : 80 : volume *= vol;
1659 : : }
1660 : :
1661 : : /* Prepare the references in the form suitable for data dependence
1662 : : analysis. We ignore unanalyzable data references (the results
1663 : : are used just as a heuristic to estimate the temporality of the
1664 : : references, hence we do not need to worry about correctness). */
1665 : 211 : for (gr = refs; gr; gr = gr->next)
1666 : 312 : for (ref = gr->refs; ref; ref = ref->next)
1667 : : {
1668 : 171 : dr = create_data_ref (loop_preheader_edge (nest),
1669 : : loop_containing_stmt (ref->stmt),
1670 : 171 : ref->mem, ref->stmt, !ref->write_p, false);
1671 : :
1672 : 171 : if (dr)
1673 : : {
1674 : 171 : ref->reuse_distance = volume;
1675 : 171 : dr->aux = ref;
1676 : 171 : datarefs.safe_push (dr);
1677 : : }
1678 : : else
1679 : : no_other_refs = false;
1680 : : }
1681 : :
1682 : 241 : FOR_EACH_VEC_ELT (datarefs, i, dr)
1683 : : {
1684 : 171 : dist = self_reuse_distance (dr, loop_data_size, n, loop);
1685 : 171 : ref = (struct mem_ref *) dr->aux;
1686 : 171 : if (ref->reuse_distance > dist)
1687 : 9 : ref->reuse_distance = dist;
1688 : :
1689 : 171 : if (no_other_refs)
1690 : 120 : ref->independent_p = true;
1691 : : }
1692 : :
1693 : 70 : if (!compute_all_dependences (datarefs, &dependences, vloops, true))
1694 : : return false;
1695 : :
1696 : 490 : FOR_EACH_VEC_ELT (dependences, i, dep)
1697 : : {
1698 : 420 : if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1699 : 137 : continue;
1700 : :
1701 : 283 : ref = (struct mem_ref *) DDR_A (dep)->aux;
1702 : 283 : refb = (struct mem_ref *) DDR_B (dep)->aux;
1703 : :
1704 : 283 : if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1705 : 170 : || DDR_COULD_BE_INDEPENDENT_P (dep)
1706 : 453 : || DDR_NUM_DIST_VECTS (dep) == 0)
1707 : : {
1708 : : /* If the dependence cannot be analyzed, assume that there might be
1709 : : a reuse. */
1710 : 113 : dist = 0;
1711 : :
1712 : 113 : ref->independent_p = false;
1713 : 113 : refb->independent_p = false;
1714 : : }
1715 : : else
1716 : : {
1717 : : /* The distance vectors are normalized to be always lexicographically
1718 : : positive, hence we cannot tell just from them whether DDR_A comes
1719 : : before DDR_B or vice versa. However, this is not important
1720 : : anyway -- if DDR_A is close to DDR_B, then it is either reused in
1721 : : DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1722 : : in cache (and marking it as nontemporal would not affect
1723 : : anything). */
1724 : :
1725 : : dist = volume;
1726 : 348 : for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1727 : : {
1728 : 178 : adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1729 : : loop_data_size, n);
1730 : :
1731 : : /* If this is a dependence in the innermost loop (i.e., the
1732 : : distances in all superloops are zero) and it is not
1733 : : the trivial self-dependence with distance zero, record that
1734 : : the references are not completely independent. */
1735 : 178 : if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1736 : 178 : && (ref != refb
1737 : 165 : || DDR_DIST_VECT (dep, j)[n-1] != 0))
1738 : : {
1739 : 3 : ref->independent_p = false;
1740 : 3 : refb->independent_p = false;
1741 : : }
1742 : :
1743 : : /* Ignore accesses closer than
1744 : : L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1745 : : so that we use nontemporal prefetches e.g. if a single memory
1746 : : location is accessed several times in a single iteration of
1747 : : the loop. */
1748 : 178 : if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1749 : 166 : continue;
1750 : :
1751 : 12 : if (adist < dist)
1752 : 178 : dist = adist;
1753 : : }
1754 : : }
1755 : :
1756 : 283 : if (ref->reuse_distance > dist)
1757 : 39 : ref->reuse_distance = dist;
1758 : 283 : if (refb->reuse_distance > dist)
1759 : 61 : refb->reuse_distance = dist;
1760 : : }
1761 : :
1762 : 70 : free_dependence_relations (dependences);
1763 : 70 : free_data_refs (datarefs);
1764 : 70 : free (loop_data_size);
1765 : :
1766 : 70 : if (dump_file && (dump_flags & TDF_DETAILS))
1767 : : {
1768 : 16 : fprintf (dump_file, "Reuse distances:\n");
1769 : 58 : for (gr = refs; gr; gr = gr->next)
1770 : 53 : for (ref = gr->refs; ref; ref = ref->next)
1771 : 27 : fprintf (dump_file, " reference %u:%u distance %u\n",
1772 : 27 : ref->group->uid, ref->uid, ref->reuse_distance);
1773 : : }
1774 : :
1775 : : return true;
1776 : 72 : }
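 : :
 : : /* Putting invented numbers on the above: if the innermost loop brings
 : :    in 16 bytes of new data per iteration and is estimated to run 1000
 : :    times, LOOP_DATA_SIZE of its parent is about 16000 bytes per parent
 : :    iteration.  A dependence whose distance in the innermost loop is
 : :    10000 then corresponds to roughly 10000 * 16 = 160000 bytes of
 : :    intervening traffic; that value, capped by the total VOLUME, becomes
 : :    the reuse distance of both references and is later used to decide
 : :    whether nontemporal prefetches are appropriate.  */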
1777 : :
1778 : : /* Determine whether or not the trip count to ahead ratio is too small based
1779 : : on profitability considerations.
1780 : : AHEAD: the iteration ahead distance,
1781 : : EST_NITER: the estimated trip count. */
1782 : :
1783 : : static bool
1784 : 153 : trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1785 : : {
1786 : : /* Assume trip count to ahead ratio is big enough if the trip count could not
1787 : : be estimated at compile time. */
1788 : 153 : if (est_niter < 0)
1789 : : return false;
1790 : :
1791 : 122 : if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1792 : : {
1793 : 31 : if (dump_file && (dump_flags & TDF_DETAILS))
1794 : 1 : fprintf (dump_file,
1795 : : "Not prefetching -- loop estimated to roll only %d times\n",
1796 : : (int) est_niter);
1797 : 31 : return true;
1798 : : }
1799 : :
1800 : : return false;
1801 : : }
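 : :
 : : /* For example, with an illustrative TRIP_COUNT_TO_AHEAD_RATIO of 4
 : :    and an AHEAD distance of 5 iterations, a loop estimated to roll
 : :    fewer than 4 * 5 = 20 times is rejected, since most of its
 : :    prefetches would target iterations past the end of the loop.  */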
1802 : :
1803 : : /* Determine whether or not the number of memory references in the loop is
1804 : : reasonable based on profitability and compilation time considerations.
1805 : : NINSNS: estimated number of instructions in the loop,
1806 : : MEM_REF_COUNT: total number of memory references in the loop. */
1807 : :
1808 : : static bool
1809 : 122 : mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1810 : : {
1811 : 122 : int insn_to_mem_ratio;
1812 : :
1813 : 122 : if (mem_ref_count == 0)
1814 : : return false;
1815 : :
1816 : : /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1817 : : (compute_all_dependences) have high costs based on quadratic complexity.
1818 : : To avoid huge compilation time, we give up prefetching if mem_ref_count
1819 : : is too large. */
1820 : 94 : if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1821 : : return false;
1822 : :
1823 : : /* Prefetching improves performance by overlapping cache missing
1824 : : memory accesses with CPU operations. If the loop does not have
1825 : : enough CPU operations to overlap with memory operations, prefetching
1826 : : won't give a significant benefit. One approximate way of checking
1827 : : this is to require the ratio of instructions to memory references to
1828 : : be above a certain limit. This approximation works well in practice.
1829 : : TODO: Implement a more precise computation by estimating the time
1830 : : for each CPU or memory op in the loop. Time estimates for memory ops
1831 : : should account for cache misses. */
1832 : 94 : insn_to_mem_ratio = ninsns / mem_ref_count;
1833 : :
1834 : 94 : if (insn_to_mem_ratio < param_prefetch_min_insn_to_mem_ratio)
1835 : : {
1836 : 17 : if (dump_file && (dump_flags & TDF_DETAILS))
1837 : 0 : fprintf (dump_file,
1838 : : "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1839 : : insn_to_mem_ratio);
1840 : 17 : return false;
1841 : : }
1842 : :
1843 : : return true;
1844 : : }
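 : :
 : : /* E.g. (illustrative numbers) NINSNS = 20 and MEM_REF_COUNT = 8 give
 : :    a ratio of 2; with param_prefetch_min_insn_to_mem_ratio set to 3
 : :    the loop is rejected, because there is too little computation to
 : :    overlap with the memory accesses.  */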
1845 : :
1846 : : /* Determine whether or not the instruction to prefetch ratio in the loop is
1847 : : too small based on profitability considerations.
1848 : : NINSNS: estimated number of instructions in the loop,
1849 : : PREFETCH_COUNT: an estimate of the number of prefetches,
1850 : : UNROLL_FACTOR: the factor to unroll the loop if prefetching. */
1851 : :
1852 : : static bool
1853 : 72 : insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1854 : : unsigned unroll_factor)
1855 : : {
1856 : 72 : int insn_to_prefetch_ratio;
1857 : :
1858 : : /* Prefetching most likely causes performance degradation when the instruction
1859 : : to prefetch ratio is too small. Too many prefetch instructions in a loop
1860 : : may reduce the I-cache performance.
1861 : : (unroll_factor * ninsns) is used to estimate the number of instructions in
1862 : : the unrolled loop. This implementation is a bit simplistic -- the number
1863 : : of issued prefetch instructions is also affected by unrolling. So,
1864 : : prefetch_mod and the unroll factor should be taken into account when
1865 : : determining prefetch_count. Also, the number of insns of the unrolled
1866 : : loop will usually be significantly smaller than the number of insns of the
1867 : : original loop * unroll_factor (at least the induction variable increases
1868 : : and the exit branches will get eliminated), so it might be better to use
1869 : : tree_estimate_loop_size + estimated_unrolled_size. */
1870 : 72 : insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1871 : 72 : if (insn_to_prefetch_ratio < param_min_insn_to_prefetch_ratio)
1872 : : {
1873 : 8 : if (dump_file && (dump_flags & TDF_DETAILS))
1874 : 1 : fprintf (dump_file,
1875 : : "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1876 : : insn_to_prefetch_ratio);
1877 : 8 : return true;
1878 : : }
1879 : :
1880 : : return false;
1881 : : }
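 : :
 : : /* A sketch with invented numbers: UNROLL_FACTOR = 4, NINSNS = 25 and
 : :    PREFETCH_COUNT = 12 give (4 * 25) / 12 = 8; if
 : :    param_min_insn_to_prefetch_ratio is 9, prefetching is skipped so
 : :    that the unrolled body is not flooded with prefetch
 : :    instructions.  */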
1882 : :
1883 : :
1884 : : /* Issue prefetch instructions for array references in LOOP. Returns
1885 : : true if the LOOP was unrolled and updates NEED_LC_SSA_UPDATE if we need
1886 : : to update SSA for virtual operands and LC SSA for a split edge. */
1887 : :
1888 : : static bool
1889 : 153 : loop_prefetch_arrays (class loop *loop, bool &need_lc_ssa_update)
1890 : : {
1891 : 153 : struct mem_ref_group *refs;
1892 : 153 : unsigned ahead, ninsns, time, unroll_factor;
1893 : 153 : HOST_WIDE_INT est_niter;
1894 : 153 : class tree_niter_desc desc;
1895 : 153 : bool unrolled = false, no_other_refs;
1896 : 153 : unsigned prefetch_count;
1897 : 153 : unsigned mem_ref_count;
1898 : :
1899 : 153 : if (optimize_loop_nest_for_size_p (loop))
1900 : : {
1901 : 0 : if (dump_file && (dump_flags & TDF_DETAILS))
1902 : 0 : fprintf (dump_file, " ignored (cold area)\n");
1903 : 0 : return false;
1904 : : }
1905 : :
1906 : : /* FIXME: the time should be weighted by the probabilities of the blocks in
1907 : : the loop body. */
1908 : 153 : time = tree_num_loop_insns (loop, &eni_time_weights);
1909 : 153 : if (time == 0)
1910 : : return false;
1911 : :
1912 : 153 : ahead = (param_prefetch_latency + time - 1) / time;
1913 : 153 : est_niter = estimated_stmt_executions_int (loop);
1914 : 153 : if (est_niter == -1)
1915 : 82 : est_niter = likely_max_stmt_executions_int (loop);
1916 : :
1917 : : /* Prefetching is not likely to be profitable if the trip count to ahead
1918 : : ratio is too small. */
1919 : 153 : if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1920 : : return false;
1921 : :
1922 : 122 : ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1923 : :
1924 : : /* Step 1: gather the memory references. */
1925 : 122 : refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1926 : :
1927 : : /* Give up prefetching if the number of memory references in the
1928 : : loop is not reasonable based on profitability and compilation time
1929 : : considerations. */
1930 : 122 : if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1931 : 45 : goto fail;
1932 : :
1933 : : /* Step 2: estimate the reuse effects. */
1934 : 77 : prune_by_reuse (refs);
1935 : :
1936 : 77 : if (nothing_to_prefetch_p (refs))
1937 : 5 : goto fail;
1938 : :
1939 : 72 : if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
1940 : 0 : goto fail;
1941 : :
1942 : : /* Step 3: determine unroll factor. */
1943 : 72 : unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1944 : : est_niter);
1945 : :
1946 : : /* Estimate prefetch count for the unrolled loop. */
1947 : 72 : prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1948 : 72 : if (prefetch_count == 0)
1949 : 0 : goto fail;
1950 : :
1951 : 72 : if (dump_file && (dump_flags & TDF_DETAILS))
1952 : 16 : fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1953 : : HOST_WIDE_INT_PRINT_DEC "\n"
1954 : : "insn count %d, mem ref count %d, prefetch count %d\n",
1955 : : ahead, unroll_factor, est_niter,
1956 : : ninsns, mem_ref_count, prefetch_count);
1957 : :
1958 : : /* Prefetching is not likely to be profitable if the instruction to prefetch
1959 : : ratio is too small. */
1960 : 72 : if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1961 : : unroll_factor))
1962 : 8 : goto fail;
1963 : :
1964 : 64 : need_lc_ssa_update |= mark_nontemporal_stores (loop, refs);
1965 : :
1966 : : /* Step 4: what to prefetch? */
1967 : 64 : if (!schedule_prefetches (refs, unroll_factor, ahead))
1968 : 2 : goto fail;
1969 : :
1970 : : /* Step 5: unroll the loop. TODO -- peeling of first and last few
1971 : : iterations so that we do not issue superfluous prefetches. */
1972 : 62 : if (unroll_factor != 1)
1973 : : {
1974 : 33 : tree_unroll_loop (loop, unroll_factor, &desc);
1975 : 33 : unrolled = true;
1976 : : }
1977 : :
1978 : : /* Step 6: issue the prefetches. */
1979 : 62 : issue_prefetches (refs, unroll_factor, ahead);
1980 : :
1981 : 122 : fail:
1982 : 122 : release_mem_refs (refs);
1983 : 122 : return unrolled;
1984 : 153 : }
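 : :
 : : /* Note that the AHEAD distance above is just the prefetch latency
 : :    divided by the estimated time of one iteration, rounded up: with
 : :    made-up values param_prefetch_latency = 200 and time = 30,
 : :    ahead = (200 + 30 - 1) / 30 = 7, so each reference would be
 : :    prefetched about seven iterations before its use.  */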
1985 : :
1986 : : /* Issue prefetch instructions for array references in loops. */
1987 : :
1988 : : unsigned int
1989 : 107 : tree_ssa_prefetch_arrays (void)
1990 : : {
1991 : 107 : bool unrolled = false;
1992 : 107 : bool need_lc_ssa_update = false;
1993 : 107 : int todo_flags = 0;
1994 : :
1995 : 107 : if (!targetm.have_prefetch ()
1996 : : /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
1997 : : -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
1998 : : of processor costs and i486 does not have prefetch, but
1999 : : -march=pentium4 causes targetm.have_prefetch to be true. Ugh. */
2000 : 107 : || PREFETCH_BLOCK == 0)
2001 : : return 0;
2002 : :
2003 : 107 : if (dump_file && (dump_flags & TDF_DETAILS))
2004 : : {
2005 : 11 : fprintf (dump_file, "Prefetching parameters:\n");
2006 : 11 : fprintf (dump_file, " simultaneous prefetches: %d\n",
2007 : : param_simultaneous_prefetches);
2008 : 11 : fprintf (dump_file, " prefetch latency: %d\n", param_prefetch_latency);
2009 : 11 : fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
2010 : 11 : fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
2011 : 11 : L1_CACHE_SIZE_BYTES / param_l1_cache_line_size,
2012 : : param_l1_cache_size);
2013 : 11 : fprintf (dump_file, " L1 cache line size: %d\n",
2014 : : param_l1_cache_line_size);
2015 : 11 : fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size);
2016 : 11 : fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
2017 : : param_min_insn_to_prefetch_ratio);
2018 : 11 : fprintf (dump_file, " min insn-to-mem ratio: %d \n",
2019 : : param_prefetch_min_insn_to_mem_ratio);
2020 : 11 : fprintf (dump_file, "\n");
2021 : : }
2022 : :
2023 : 107 : initialize_original_copy_tables ();
2024 : :
2025 : 107 : if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
2026 : : {
2027 : 4 : tree type = build_function_type_list (void_type_node,
2028 : : const_ptr_type_node, NULL_TREE);
2029 : 4 : tree decl = add_builtin_function ("__builtin_prefetch", type,
2030 : : BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
2031 : : NULL, NULL_TREE);
2032 : 4 : DECL_IS_NOVOPS (decl) = true;
2033 : 4 : set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
2034 : : }
2035 : :
2036 : 474 : for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
2037 : : {
2038 : 153 : if (dump_file && (dump_flags & TDF_DETAILS))
2039 : 25 : fprintf (dump_file, "Processing loop %d:\n", loop->num);
2040 : :
2041 : 153 : unrolled |= loop_prefetch_arrays (loop, need_lc_ssa_update);
2042 : :
2043 : 153 : if (dump_file && (dump_flags & TDF_DETAILS))
2044 : 25 : fprintf (dump_file, "\n\n");
2045 : 107 : }
2046 : :
2047 : 107 : if (need_lc_ssa_update)
2048 : 2 : rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_only_virtuals);
2049 : :
2050 : 107 : if (unrolled)
2051 : : {
2052 : 19 : scev_reset ();
2053 : 19 : todo_flags |= TODO_cleanup_cfg;
2054 : : }
2055 : :
2056 : 107 : free_original_copy_tables ();
2057 : 107 : return todo_flags;
2058 : : }
2059 : :
2060 : : /* Prefetching. */
2061 : :
2062 : : namespace {
2063 : :
2064 : : const pass_data pass_data_loop_prefetch =
2065 : : {
2066 : : GIMPLE_PASS, /* type */
2067 : : "aprefetch", /* name */
2068 : : OPTGROUP_LOOP, /* optinfo_flags */
2069 : : TV_TREE_PREFETCH, /* tv_id */
2070 : : ( PROP_cfg | PROP_ssa ), /* properties_required */
2071 : : 0, /* properties_provided */
2072 : : 0, /* properties_destroyed */
2073 : : 0, /* todo_flags_start */
2074 : : 0, /* todo_flags_finish */
2075 : : };
2076 : :
2077 : : class pass_loop_prefetch : public gimple_opt_pass
2078 : : {
2079 : : public:
2080 : 280047 : pass_loop_prefetch (gcc::context *ctxt)
2081 : 560094 : : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
2082 : : {}
2083 : :
2084 : : /* opt_pass methods: */
2085 : 226801 : bool gate (function *) final override
2086 : : {
2087 : 226801 : return flag_prefetch_loop_arrays > 0;
2088 : : }
2089 : : unsigned int execute (function *) final override;
2090 : :
2091 : : }; // class pass_loop_prefetch
2092 : :
2093 : : unsigned int
2094 : 109 : pass_loop_prefetch::execute (function *fun)
2095 : : {
2096 : 220 : if (number_of_loops (fun) <= 1)
2097 : : return 0;
2098 : :
2099 : 109 : if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0)
2100 : : {
2101 : 2 : static bool warned = false;
2102 : :
2103 : 2 : if (!warned)
2104 : : {
2105 : 1 : warning (OPT_Wdisabled_optimization,
2106 : : "%<l1-cache-line-size%> parameter is not a power of two: %d",
2107 : : PREFETCH_BLOCK);
2108 : 1 : warned = true;
2109 : : }
2110 : 2 : return 0;
2111 : : }
2112 : :
2113 : 107 : return tree_ssa_prefetch_arrays ();
2114 : : }
2115 : :
2116 : : } // anon namespace
2117 : :
2118 : : gimple_opt_pass *
2119 : 280047 : make_pass_loop_prefetch (gcc::context *ctxt)
2120 : : {
2121 : 280047 : return new pass_loop_prefetch (ctxt);
2122 : : }
2123 : :
2124 : :
|