Line data Source code
1 : /* Array prefetching.
2 : Copyright (C) 2005-2026 Free Software Foundation, Inc.
3 :
4 : This file is part of GCC.
5 :
6 : GCC is free software; you can redistribute it and/or modify it
7 : under the terms of the GNU General Public License as published by the
8 : Free Software Foundation; either version 3, or (at your option) any
9 : later version.
10 :
11 : GCC is distributed in the hope that it will be useful, but WITHOUT
12 : ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 : for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with GCC; see the file COPYING3. If not see
18 : <http://www.gnu.org/licenses/>. */
19 :
20 : #include "config.h"
21 : #include "system.h"
22 : #include "coretypes.h"
23 : #include "backend.h"
24 : #include "target.h"
25 : #include "rtl.h"
26 : #include "tree.h"
27 : #include "gimple.h"
28 : #include "predict.h"
29 : #include "tree-pass.h"
30 : #include "gimple-ssa.h"
31 : #include "optabs-query.h"
32 : #include "tree-pretty-print.h"
33 : #include "fold-const.h"
34 : #include "stor-layout.h"
35 : #include "gimplify.h"
36 : #include "gimple-iterator.h"
37 : #include "gimplify-me.h"
38 : #include "tree-ssa-loop-ivopts.h"
39 : #include "tree-ssa-loop-manip.h"
40 : #include "tree-ssa-loop-niter.h"
41 : #include "tree-ssa-loop.h"
42 : #include "ssa.h"
43 : #include "tree-into-ssa.h"
44 : #include "cfgloop.h"
45 : #include "tree-scalar-evolution.h"
46 : #include "langhooks.h"
47 : #include "tree-inline.h"
48 : #include "tree-data-ref.h"
49 : #include "diagnostic-core.h"
50 : #include "dbgcnt.h"
51 :
52 : /* This pass inserts prefetch instructions to optimize cache usage during
53 : accesses to arrays in loops. It processes loops sequentially and:
54 :
55 : 1) Gathers all memory references in the single loop.
56 : 2) For each of the references it decides when it is profitable to prefetch
57 : it. To do so, we evaluate the reuse among the accesses and determine
58 : two values: PREFETCH_BEFORE (meaning that it only makes sense to do
59 : prefetching in the first PREFETCH_BEFORE iterations of the loop) and
60 : PREFETCH_MOD (meaning that it only makes sense to prefetch in the
61 : iterations of the loop that are zero modulo PREFETCH_MOD). For example
62 : (assuming cache line size is 64 bytes, char has size 1 byte and there
63 : is no hardware sequential prefetch):
64 :
65 : char *a;
66 : for (i = 0; i < max; i++)
67 : {
68 : a[255] = ...; (0)
69 : a[i] = ...; (1)
70 : a[i + 64] = ...; (2)
71 : a[16*i] = ...; (3)
72 : a[187*i] = ...; (4)
73 : a[187*i + 50] = ...; (5)
74 : }
75 :
76 : (0) obviously has PREFETCH_BEFORE 1
77 : (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
78 : location 64 iterations before it, and PREFETCH_MOD 64 (since
79 : it hits the same cache line otherwise).
80 : (2) has PREFETCH_MOD 64
81 : (3) has PREFETCH_MOD 4
82 : (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
83 : the cache line accessed by (5) is the same with probability only
84 : 7/32.
85 : (5) has PREFETCH_MOD 1 as well.
86 :
87 : Additionally, we use data dependence analysis to determine for each
88 : reference the distance until the first reuse; this information is used
89 : to determine the temporality of the issued prefetch instruction.
90 :
91 : 3) We determine how much ahead we need to prefetch. The number of
92 : iterations needed is time to fetch / time spent in one iteration of
93 : the loop. The problem is that we do not know either of these values,
94 : so we just make a heuristic guess based on a magic (possibly)
95 : target-specific constant and size of the loop.
96 :
97 : 4) Determine which of the references we prefetch. We take into account
98 : that there is a maximum number of simultaneous prefetches (provided
99 : by the machine description). We issue as many prefetches as possible
100 : while still within this bound (starting with those with lowest
101 : prefetch_mod, since they are responsible for most of the cache
102 : misses).
103 :
104 : 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
105 : and PREFETCH_BEFORE requirements (within some bounds), and to avoid
106 : prefetching nonaccessed memory.
107 : TODO -- actually implement peeling.
108 :
109 : 6) We actually emit the prefetch instructions. ??? Perhaps emit the
110 : prefetch instructions with guards in cases where 5) was not sufficient
111 : to satisfy the constraints?
112 :
113 : A cost model is implemented to determine whether or not prefetching is
114 : profitable for a given loop. The cost model has three heuristics:
115 :
116 : 1. Function trip_count_to_ahead_ratio_too_small_p implements a
117 : heuristic that determines whether or not the loop has too few
118 : iterations (compared to ahead). Prefetching is not likely to be
119 : beneficial if the trip count to ahead ratio is below a certain
120 : minimum.
121 :
122 : 2. Function mem_ref_count_reasonable_p implements a heuristic that
123 : determines whether the given loop has enough CPU ops that can be
124 : overlapped with cache missing memory ops. If not, the loop
125 : won't benefit from prefetching. In the implementation,
126 : prefetching is not considered beneficial if the ratio between
127 : the instruction count and the mem ref count is below a certain
128 : minimum.
129 :
130 : 3. Function insn_to_prefetch_ratio_too_small_p implements a
131 : heuristic that disables prefetching in a loop if the prefetching
132 : cost is above a certain limit. The relative prefetching cost is
133 : estimated by taking the ratio between the prefetch count and the
134 : total instruction count (this models the I-cache cost).
135 :
136 : The limits used in these heuristics are defined as parameters with
137 : reasonable default values. Machine-specific default values will be
138 : added later.
139 :
140 : Some other TODO:
141 : -- write and use more general reuse analysis (that could be also used
142 : in other cache aimed loop optimizations)
143 : -- make it behave sanely together with the prefetches given by the user
144 : (now we just ignore them; at the very least we should avoid
145 : optimizing loops into which the user has inserted his own prefetches)
146 : -- we assume cache line size alignment of arrays; this could be
147 : improved. */
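/* As a rough illustration of steps 3)-6) above (a sketch with assumed
   numbers, not taken from any particular target): for

     for (i = 0; i < n; i++)
       sum += a[i];

   with 4-byte elements and a 64-byte cache line, the reference a[i] gets
   PREFETCH_MOD 16, so the loop would typically be unrolled 16 times and a
   single call roughly equivalent to

     __builtin_prefetch (&a[i + AHEAD], 0, 3);

   would be emitted per unrolled body, where AHEAD is the estimated number
   of original iterations needed to hide the prefetch latency; a reference
   classified as nontemporal would use locality 0 instead of 3.  */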
148 :
149 : /* Magic constants follow. These should be replaced by machine specific
150 : numbers. */
151 :
152 : /* True if a write can be prefetched by a read prefetch. */
153 :
154 : #ifndef WRITE_CAN_USE_READ_PREFETCH
155 : #define WRITE_CAN_USE_READ_PREFETCH 1
156 : #endif
157 :
158 : /* True if a read can be prefetched by a write prefetch. */
159 :
160 : #ifndef READ_CAN_USE_WRITE_PREFETCH
161 : #define READ_CAN_USE_WRITE_PREFETCH 0
162 : #endif
163 :
164 : /* The size of the block loaded by a single prefetch. Usually, this is
165 : the same as the cache line size (at the moment, we only consider one level
166 : of cache hierarchy). */
167 :
168 : #ifndef PREFETCH_BLOCK
169 : #define PREFETCH_BLOCK param_l1_cache_line_size
170 : #endif
171 :
172 : /* Do we have forward hardware sequential prefetching? */
173 :
174 : #ifndef HAVE_FORWARD_PREFETCH
175 : #define HAVE_FORWARD_PREFETCH 0
176 : #endif
177 :
178 : /* Do we have backward hardware sequential prefetching? */
179 :
180 : #ifndef HAVE_BACKWARD_PREFETCH
181 : #define HAVE_BACKWARD_PREFETCH 0
182 : #endif
183 :
184 : /* In some cases we are only able to determine that there is a certain
185 : probability that the two accesses hit the same cache line. In this
186 : case, we issue the prefetches for both of them if this probability
187 : is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
188 :
189 : #ifndef ACCEPTABLE_MISS_RATE
190 : #define ACCEPTABLE_MISS_RATE 50
191 : #endif
192 :
193 : #define L1_CACHE_SIZE_BYTES ((unsigned) (param_l1_cache_size * 1024))
194 : #define L2_CACHE_SIZE_BYTES ((unsigned) (param_l2_cache_size * 1024))
195 :
196 : /* We consider a memory access nontemporal if it is not reused sooner than
197 : after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
198 : accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
199 : so that we use nontemporal prefetches e.g. if a single memory location
200 : is accessed several times in a single iteration of the loop. */
201 : #define NONTEMPORAL_FRACTION 16
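/* For example, assuming the generic default parameters
   (--param l1-cache-size=64 and --param l2-cache-size=512, both in kB),
   an access is treated as nontemporal only if at least 512 kB of memory is
   accessed before its reuse, while reuses closer than 64 kB / 16 = 4 kB
   are ignored.  */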
202 :
203 : /* In case we have to emit a memory fence instruction after the loop that
204 : uses nontemporal stores, this defines the builtin to use. */
205 :
206 : #ifndef FENCE_FOLLOWING_MOVNT
207 : #define FENCE_FOLLOWING_MOVNT NULL_TREE
208 : #endif
209 :
210 : /* It is not profitable to prefetch when the trip count is not at
211 : least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
212 : For example, in a loop with a prefetch ahead distance of 10,
213 : supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
214 : profitable to prefetch when the trip count is greater than or equal to
215 : 40. In that case, 30 out of the 40 iterations will benefit from
216 : prefetching. */
217 :
218 : #ifndef TRIP_COUNT_TO_AHEAD_RATIO
219 : #define TRIP_COUNT_TO_AHEAD_RATIO 4
220 : #endif
221 :
222 : /* A group of references among which reuse may occur. */
223 :
224 : struct mem_ref_group
225 : {
226 : tree base; /* Base of the reference. */
227 : tree step; /* Step of the reference. */
228 : struct mem_ref *refs; /* References in the group. */
229 : struct mem_ref_group *next; /* Next group of references. */
230 : unsigned int uid; /* Group UID, used only for debugging. */
231 : };
232 :
233 : /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
234 :
235 : #define PREFETCH_ALL HOST_WIDE_INT_M1U
236 :
237 : /* Do not generate a prefetch if the unroll factor is significantly less
238 : than what is required by the prefetch. This is to avoid redundant
239 : prefetches. For example, when prefetch_mod is 16 and unroll_factor is
240 : 2, prefetching requires unrolling the loop 16 times, but
241 : the loop is actually unrolled twice. In this case (ratio = 8),
242 : prefetching is not likely to be beneficial. */
243 :
244 : #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
245 : #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
246 : #endif
247 :
248 : /* Some of the prefetch computations have quadratic complexity. We want to
249 : avoid huge compile times and, therefore, want to limit the number of
250 : memory references per loop where we consider prefetching. */
251 :
252 : #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
253 : #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
254 : #endif
255 :
256 : /* The memory reference. */
257 :
258 : struct mem_ref
259 : {
260 : gimple *stmt; /* Statement in which the reference appears. */
261 : tree mem; /* The reference. */
262 : HOST_WIDE_INT delta; /* Constant offset of the reference. */
263 : struct mem_ref_group *group; /* The group of references it belongs to. */
264 : unsigned HOST_WIDE_INT prefetch_mod;
265 : /* Prefetch only each PREFETCH_MOD-th
266 : iteration. */
267 : unsigned HOST_WIDE_INT prefetch_before;
268 : /* Prefetch only first PREFETCH_BEFORE
269 : iterations. */
270 : unsigned reuse_distance; /* The amount of data accessed before the first
271 : reuse of this value. */
272 : struct mem_ref *next; /* The next reference in the group. */
273 : unsigned int uid; /* Ref UID, used only for debugging. */
274 : unsigned write_p : 1; /* Is it a write? */
275 : unsigned independent_p : 1; /* True if the reference is independent of
276 : all other references inside the loop. */
277 : unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */
278 : unsigned storent_p : 1; /* True if we changed the store to a
279 : nontemporal one. */
280 : };
281 :
282 : /* Dumps information about a memory reference to FILE. */
283 : static void
284 31 : dump_mem_details (FILE *file, tree base, tree step,
285 : HOST_WIDE_INT delta, bool write_p)
286 : {
287 31 : fprintf (file, "(base ");
288 31 : print_generic_expr (file, base, TDF_SLIM);
289 31 : fprintf (file, ", step ");
290 31 : if (cst_and_fits_in_hwi (step))
291 23 : fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
292 : else
293 8 : print_generic_expr (file, step, TDF_SLIM);
294 31 : fprintf (file, ")\n");
295 31 : fprintf (file, " delta " HOST_WIDE_INT_PRINT_DEC "\n", delta);
296 51 : fprintf (file, " %s\n\n", write_p ? "write" : "read");
297 31 : }
298 :
299 : /* Dumps information about reference REF to FILE. */
300 :
301 : static void
302 54 : dump_mem_ref (FILE *file, struct mem_ref *ref)
303 : {
304 54 : fprintf (file, "reference %u:%u (", ref->group->uid, ref->uid);
305 54 : print_generic_expr (file, ref->mem, TDF_SLIM);
306 54 : fprintf (file, ")\n");
307 54 : }
308 :
309 : /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
310 : exist. */
311 :
312 : static struct mem_ref_group *
313 395 : find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
314 : {
315 : /* Global count for setting struct mem_ref_group->uid. */
316 395 : static unsigned int last_mem_ref_group_uid = 0;
317 :
318 395 : struct mem_ref_group *group;
319 :
320 1172 : for (; *groups; groups = &(*groups)->next)
321 : {
322 948 : if (operand_equal_p ((*groups)->step, step, 0)
323 948 : && operand_equal_p ((*groups)->base, base, 0))
324 169 : return *groups;
325 :
326 : /* If step is an integer constant, keep the list of groups sorted
327 : by decreasing step. */
328 1462 : if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
329 1454 : && int_cst_value ((*groups)->step) < int_cst_value (step))
330 : break;
331 : }
332 :
333 226 : group = XNEW (struct mem_ref_group);
334 226 : group->base = base;
335 226 : group->step = step;
336 226 : group->refs = NULL;
337 226 : group->uid = ++last_mem_ref_group_uid;
338 226 : group->next = *groups;
339 226 : *groups = group;
340 :
341 226 : return group;
342 : }
343 :
344 : /* Records a memory reference MEM in GROUP with offset DELTA and write status
345 : WRITE_P. The reference occurs in statement STMT. */
346 :
347 : static void
348 395 : record_ref (struct mem_ref_group *group, gimple *stmt, tree mem,
349 : HOST_WIDE_INT delta, bool write_p)
350 : {
351 395 : unsigned int last_mem_ref_uid = 0;
352 395 : struct mem_ref **aref;
353 :
354 : /* Do not record the same address twice. */
355 435 : for (aref = &group->refs; *aref; aref = &(*aref)->next)
356 : {
357 179 : last_mem_ref_uid = (*aref)->uid;
358 :
359 : /* A write reference need not be able to reuse a read prefetch,
360 : or vice versa. */
361 179 : if (!WRITE_CAN_USE_READ_PREFETCH
362 : && write_p
363 : && !(*aref)->write_p)
364 : continue;
365 179 : if (!READ_CAN_USE_WRITE_PREFETCH
366 : && !write_p
367 30 : && (*aref)->write_p)
368 0 : continue;
369 :
370 179 : if ((*aref)->delta == delta)
371 : return;
372 : }
373 :
374 256 : (*aref) = XNEW (struct mem_ref);
375 256 : (*aref)->stmt = stmt;
376 256 : (*aref)->mem = mem;
377 256 : (*aref)->delta = delta;
378 256 : (*aref)->write_p = write_p;
379 256 : (*aref)->prefetch_before = PREFETCH_ALL;
380 256 : (*aref)->prefetch_mod = 1;
381 256 : (*aref)->reuse_distance = 0;
382 256 : (*aref)->issue_prefetch_p = false;
383 256 : (*aref)->group = group;
384 256 : (*aref)->next = NULL;
385 256 : (*aref)->independent_p = false;
386 256 : (*aref)->storent_p = false;
387 256 : (*aref)->uid = last_mem_ref_uid + 1;
388 :
389 256 : if (dump_file && (dump_flags & TDF_DETAILS))
390 : {
391 27 : dump_mem_ref (dump_file, *aref);
392 :
393 27 : fprintf (dump_file, " group %u ", group->uid);
394 27 : dump_mem_details (dump_file, group->base, group->step, delta,
395 : write_p);
396 : }
397 : }
398 :
399 : /* Release memory references in GROUPS. */
400 :
401 : static void
402 120 : release_mem_refs (struct mem_ref_group *groups)
403 : {
404 120 : struct mem_ref_group *next_g;
405 120 : struct mem_ref *ref, *next_r;
406 :
407 346 : for (; groups; groups = next_g)
408 : {
409 226 : next_g = groups->next;
410 482 : for (ref = groups->refs; ref; ref = next_r)
411 : {
412 256 : next_r = ref->next;
413 256 : free (ref);
414 : }
415 226 : free (groups);
416 : }
417 120 : }
418 :
419 : /* A structure used to pass arguments to idx_analyze_ref. */
420 :
421 : struct ar_data
422 : {
423 : class loop *loop; /* Loop of the reference. */
424 : gimple *stmt; /* Statement of the reference. */
425 : tree *step; /* Step of the memory reference. */
426 : HOST_WIDE_INT *delta; /* Offset of the memory reference. */
427 : };
428 :
429 : /* Analyzes a single INDEX of a memory reference to obtain information
430 : described at analyze_ref. Callback for for_each_index. */
431 :
432 : static bool
433 550 : idx_analyze_ref (tree base, tree *index, void *data)
434 : {
435 550 : struct ar_data *ar_data = (struct ar_data *) data;
436 550 : tree ibase, step, stepsize;
437 550 : HOST_WIDE_INT idelta = 0, imult = 1;
438 550 : affine_iv iv;
439 :
440 1100 : if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
441 : *index, &iv, true))
442 : return false;
443 415 : ibase = iv.base;
444 415 : step = iv.step;
445 :
446 415 : if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
447 415 : && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
448 : {
449 29 : idelta = int_cst_value (TREE_OPERAND (ibase, 1));
450 29 : ibase = TREE_OPERAND (ibase, 0);
451 : }
452 415 : if (cst_and_fits_in_hwi (ibase))
453 : {
454 23 : idelta += int_cst_value (ibase);
455 23 : ibase = build_int_cst (TREE_TYPE (ibase), 0);
456 : }
457 :
458 415 : if (TREE_CODE (base) == ARRAY_REF)
459 : {
460 55 : stepsize = array_ref_element_size (base);
461 55 : if (!cst_and_fits_in_hwi (stepsize))
462 : return false;
463 55 : imult = int_cst_value (stepsize);
464 55 : step = fold_build2 (MULT_EXPR, sizetype,
465 : fold_convert (sizetype, step),
466 : fold_convert (sizetype, stepsize));
467 55 : idelta *= imult;
468 : }
469 :
470 415 : if (*ar_data->step == NULL_TREE)
471 399 : *ar_data->step = step;
472 : else
473 16 : *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
474 : fold_convert (sizetype, *ar_data->step),
475 : fold_convert (sizetype, step));
476 415 : *ar_data->delta += idelta;
477 415 : *index = ibase;
478 :
479 415 : return true;
480 : }
481 :
482 : /* Tries to express REF_P in the form &BASE + STEP * iter + DELTA, where DELTA and
483 : STEP are integer constants and iter is number of iterations of LOOP. The
484 : reference occurs in statement STMT. Strips nonaddressable component
485 : references from REF_P. */
486 :
487 : static bool
488 536 : analyze_ref (class loop *loop, tree *ref_p, tree *base,
489 : tree *step, HOST_WIDE_INT *delta,
490 : gimple *stmt)
491 : {
492 536 : struct ar_data ar_data;
493 536 : tree off;
494 536 : HOST_WIDE_INT bit_offset;
495 536 : tree ref = *ref_p;
496 :
497 536 : *step = NULL_TREE;
498 536 : *delta = 0;
499 :
500 : /* First strip off the component references. Ignore bitfields.
501 : Also strip off the real and imaginary parts of a complex, so that
502 : they can have the same base. */
503 536 : if (TREE_CODE (ref) == REALPART_EXPR
504 536 : || TREE_CODE (ref) == IMAGPART_EXPR
505 536 : || (TREE_CODE (ref) == COMPONENT_REF
506 23 : && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
507 : {
508 8 : if (TREE_CODE (ref) == IMAGPART_EXPR)
509 4 : *delta += int_size_in_bytes (TREE_TYPE (ref));
510 8 : ref = TREE_OPERAND (ref, 0);
511 : }
512 :
513 536 : *ref_p = ref;
514 :
515 564 : for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
516 : {
517 28 : off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
518 28 : bit_offset = TREE_INT_CST_LOW (off);
519 28 : gcc_assert (bit_offset % BITS_PER_UNIT == 0);
520 :
521 28 : *delta += bit_offset / BITS_PER_UNIT;
522 : }
523 :
524 536 : *base = unshare_expr (ref);
525 536 : ar_data.loop = loop;
526 536 : ar_data.stmt = stmt;
527 536 : ar_data.step = step;
528 536 : ar_data.delta = delta;
529 536 : return for_each_index (base, idx_analyze_ref, &ar_data);
530 : }
531 :
532 : /* Record a memory reference REF to the list REFS. The reference occurs in
533 : LOOP in statement STMT and it is write if WRITE_P. Returns true if the
534 : reference was recorded, false otherwise. */
535 :
536 : static bool
537 536 : gather_memory_references_ref (class loop *loop, struct mem_ref_group **refs,
538 : tree ref, bool write_p, gimple *stmt)
539 : {
540 536 : tree base, step;
541 536 : HOST_WIDE_INT delta;
542 536 : struct mem_ref_group *agrp;
543 :
544 536 : if (get_base_address (ref) == NULL)
545 : return false;
546 :
547 536 : if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
548 : return false;
549 : /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
550 401 : if (step == NULL_TREE)
551 : return false;
552 :
553 : /* Stop if the address of BASE could not be taken. */
554 399 : if (may_be_nonaddressable_p (base))
555 : return false;
556 :
557 : /* Limit non-constant step prefetching only to the innermost loops and
558 : only when the step is loop invariant in the entire loop nest. */
559 399 : if (!cst_and_fits_in_hwi (step))
560 : {
561 187 : if (loop->inner != NULL)
562 : {
563 0 : if (dump_file && (dump_flags & TDF_DETAILS))
564 : {
565 0 : fprintf (dump_file, "Memory expression %p\n",(void *) ref );
566 0 : print_generic_expr (dump_file, ref, TDF_SLIM);
567 0 : fprintf (dump_file,":");
568 0 : dump_mem_details (dump_file, base, step, delta, write_p);
569 0 : fprintf (dump_file,
570 : "Ignoring %p, non-constant step prefetching is "
571 : "limited to inner most loops \n",
572 : (void *) ref);
573 : }
574 0 : return false;
575 : }
576 : else
577 : {
578 187 : if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
579 : {
580 4 : if (dump_file && (dump_flags & TDF_DETAILS))
581 : {
582 4 : fprintf (dump_file, "Memory expression %p\n",(void *) ref );
583 4 : print_generic_expr (dump_file, ref, TDF_SLIM);
584 4 : fprintf (dump_file,":");
585 4 : dump_mem_details (dump_file, base, step, delta, write_p);
586 4 : fprintf (dump_file,
587 : "Not prefetching, ignoring %p due to "
588 : "loop variant step\n",
589 : (void *) ref);
590 : }
591 4 : return false;
592 : }
593 : }
594 : }
595 :
596 : /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
597 : are integer constants. */
598 395 : agrp = find_or_create_group (refs, base, step);
599 395 : record_ref (agrp, stmt, ref, delta, write_p);
600 :
601 395 : return true;
602 : }
603 :
604 : /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
605 : true if there are no other memory references inside the loop. */
606 :
607 : static struct mem_ref_group *
608 120 : gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_count)
609 : {
610 120 : basic_block *body = get_loop_body_in_dom_order (loop);
611 120 : basic_block bb;
612 120 : unsigned i;
613 120 : gimple_stmt_iterator bsi;
614 120 : gimple *stmt;
615 120 : tree lhs, rhs;
616 120 : struct mem_ref_group *refs = NULL;
617 :
618 120 : *no_other_refs = true;
619 120 : *ref_count = 0;
620 :
621 : /* Scan the loop body in order, so that the former references precede the
622 : later ones. */
623 1019 : for (i = 0; i < loop->num_nodes; i++)
624 : {
625 899 : bb = body[i];
626 899 : if (bb->loop_father != loop)
627 535 : continue;
628 :
629 4446 : for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
630 : {
631 3718 : stmt = gsi_stmt (bsi);
632 :
633 3718 : if (gimple_code (stmt) != GIMPLE_ASSIGN)
634 : {
635 515 : if (gimple_vuse (stmt)
636 515 : || (is_gimple_call (stmt)
637 0 : && !(gimple_call_flags (stmt) & ECF_CONST)))
638 57 : *no_other_refs = false;
639 515 : continue;
640 : }
641 :
642 3203 : if (! gimple_vuse (stmt))
643 2124 : continue;
644 :
645 1079 : lhs = gimple_assign_lhs (stmt);
646 1079 : rhs = gimple_assign_rhs1 (stmt);
647 :
648 1079 : if (REFERENCE_CLASS_P (rhs))
649 : {
650 281 : *no_other_refs &= gather_memory_references_ref (loop, &refs,
651 : rhs, false, stmt);
652 281 : *ref_count += 1;
653 : }
654 1079 : if (REFERENCE_CLASS_P (lhs))
655 : {
656 255 : *no_other_refs &= gather_memory_references_ref (loop, &refs,
657 : lhs, true, stmt);
658 255 : *ref_count += 1;
659 : }
660 : }
661 : }
662 120 : free (body);
663 :
664 120 : return refs;
665 : }
666 :
667 : /* Prune the prefetch candidate REF using the self-reuse. */
668 :
669 : static void
670 174 : prune_ref_by_self_reuse (struct mem_ref *ref)
671 : {
672 174 : HOST_WIDE_INT step;
673 174 : bool backward;
674 :
675 : /* If the step size is non-constant, we cannot calculate prefetch_mod. */
676 174 : if (!cst_and_fits_in_hwi (ref->group->step))
677 : return;
678 :
679 125 : step = int_cst_value (ref->group->step);
680 :
681 125 : backward = step < 0;
682 :
683 125 : if (step == 0)
684 : {
685 : /* Prefetch references to invariant address just once. */
686 4 : ref->prefetch_before = 1;
687 4 : return;
688 : }
689 :
690 121 : if (backward)
691 : step = -step;
692 :
693 121 : if (step > PREFETCH_BLOCK)
694 : return;
695 :
696 108 : if ((backward && HAVE_BACKWARD_PREFETCH)
697 : || (!backward && HAVE_FORWARD_PREFETCH))
698 : {
699 : ref->prefetch_before = 1;
700 : return;
701 : }
702 :
703 108 : ref->prefetch_mod = PREFETCH_BLOCK / step;
704 : }
705 :
706 : /* Divides X by BY, rounding down. */
707 :
708 : static HOST_WIDE_INT
709 40 : ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
710 : {
711 40 : gcc_assert (by > 0);
712 :
713 40 : if (x >= 0)
714 40 : return x / (HOST_WIDE_INT) by;
715 : else
716 0 : return (x + (HOST_WIDE_INT) by - 1) / (HOST_WIDE_INT) by;
717 : }
718 :
719 : /* Given a CACHE_LINE_SIZE and two inductive memory references
720 : with a common STEP greater than CACHE_LINE_SIZE and an address
721 : difference DELTA, compute the probability that they will fall
722 : in different cache lines. Return true if the computed miss rate
723 : is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the
724 : number of distinct iterations after which the pattern repeats itself.
725 : ALIGN_UNIT is the unit of alignment in bytes. */
726 :
727 : static bool
728 4 : is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
729 : HOST_WIDE_INT step, HOST_WIDE_INT delta,
730 : unsigned HOST_WIDE_INT distinct_iters,
731 : int align_unit)
732 : {
733 4 : unsigned align, iter;
734 4 : int total_positions, miss_positions, max_allowed_miss_positions;
735 4 : int address1, address2, cache_line1, cache_line2;
736 :
737 : /* It always misses if delta is greater than or equal to the cache
738 : line size. */
739 4 : if (delta >= (HOST_WIDE_INT) cache_line_size)
740 : return false;
741 :
742 2 : gcc_assert (align_unit > 0);
743 :
744 2 : miss_positions = 0;
745 2 : total_positions = (cache_line_size / align_unit) * distinct_iters;
746 2 : max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
747 :
748 : /* Iterate through all possible alignments of the first
749 : memory reference within its cache line. */
750 6 : for (align = 0; align < cache_line_size; align += align_unit)
751 :
752 : /* Iterate through all distinct iterations. */
753 16 : for (iter = 0; iter < distinct_iters; iter++)
754 : {
755 12 : address1 = align + step * iter;
756 12 : address2 = address1 + delta;
757 12 : cache_line1 = address1 / cache_line_size;
758 12 : cache_line2 = address2 / cache_line_size;
759 12 : if (cache_line1 != cache_line2)
760 : {
761 2 : miss_positions += 1;
762 2 : if (miss_positions > max_allowed_miss_positions)
763 : return false;
764 : }
765 : }
766 : return true;
767 : }
768 :
769 : /* Prune the prefetch candidate REF using the reuse with BY.
770 : If BY_IS_BEFORE is true, BY is before REF in the loop. */
771 :
772 : static void
773 80 : prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
774 : bool by_is_before)
775 : {
776 80 : HOST_WIDE_INT step;
777 80 : bool backward;
778 80 : HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
779 80 : HOST_WIDE_INT delta = delta_b - delta_r;
780 80 : HOST_WIDE_INT hit_from;
781 80 : unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
782 80 : HOST_WIDE_INT reduced_step;
783 80 : unsigned HOST_WIDE_INT reduced_prefetch_block;
784 80 : tree ref_type;
785 80 : int align_unit;
786 :
787 : /* If the step is non-constant, we cannot calculate prefetch_before. */
788 80 : if (!cst_and_fits_in_hwi (ref->group->step)) {
789 : return;
790 : }
791 :
792 78 : step = int_cst_value (ref->group->step);
793 :
794 78 : backward = step < 0;
795 :
796 :
797 78 : if (delta == 0)
798 : {
799 : /* If the references have the same address, only prefetch the
800 : former. */
801 0 : if (by_is_before)
802 0 : ref->prefetch_before = 0;
803 :
804 0 : return;
805 : }
806 :
807 78 : if (!step)
808 : {
809 : /* If the reference addresses are invariant and fall into the
810 : same cache line, prefetch just the first one. */
811 6 : if (!by_is_before)
812 : return;
813 :
814 6 : if (ddown (ref->delta, PREFETCH_BLOCK)
815 3 : != ddown (by->delta, PREFETCH_BLOCK))
816 : return;
817 :
818 3 : ref->prefetch_before = 0;
819 3 : return;
820 : }
821 :
822 : /* Only prune the reference that is behind in the array. */
823 72 : if (backward)
824 : {
825 0 : if (delta > 0)
826 : return;
827 :
828 : /* Transform the data so that we may assume that the accesses
829 : are forward. */
830 0 : delta = - delta;
831 0 : step = -step;
832 0 : delta_r = PREFETCH_BLOCK - 1 - delta_r;
833 0 : delta_b = PREFETCH_BLOCK - 1 - delta_b;
834 : }
835 : else
836 : {
837 72 : if (delta < 0)
838 : return;
839 : }
840 :
841 : /* Check whether the two references are likely to hit the same cache
842 : line, and how distant the iterations in that it occurs are from
843 : each other. */
844 :
845 36 : if (step <= PREFETCH_BLOCK)
846 : {
847 : /* The accesses are sure to meet. Let us check when. */
848 34 : hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
849 34 : prefetch_before = (hit_from - delta_r + step - 1) / step;
850 :
851 : /* Do not reduce prefetch_before if we meet beyond cache size. */
852 34 : if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
853 : prefetch_before = PREFETCH_ALL;
854 34 : if (prefetch_before < ref->prefetch_before)
855 25 : ref->prefetch_before = prefetch_before;
856 :
857 34 : return;
858 : }
859 :
860 : /* A more complicated case with step > prefetch_block. First reduce
861 : the ratio between the step and the cache line size to its simplest
862 : terms. The resulting denominator will then represent the number of
863 : distinct iterations after which each address will go back to its
864 : initial location within the cache line. This computation assumes
865 : that PREFETCH_BLOCK is a power of two. */
866 2 : prefetch_block = PREFETCH_BLOCK;
867 2 : reduced_prefetch_block = prefetch_block;
868 2 : reduced_step = step;
869 2 : while ((reduced_step & 1) == 0
870 12 : && reduced_prefetch_block > 1)
871 : {
872 10 : reduced_step >>= 1;
873 10 : reduced_prefetch_block >>= 1;
874 : }
875 :
876 2 : prefetch_before = delta / step;
877 2 : delta %= step;
878 2 : ref_type = TREE_TYPE (ref->mem);
879 2 : align_unit = TYPE_ALIGN (ref_type) / 8;
880 2 : if (is_miss_rate_acceptable (prefetch_block, step, delta,
881 : reduced_prefetch_block, align_unit))
882 : {
883 : /* Do not reduce prefetch_before if we meet beyond cache size. */
884 0 : if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
885 : prefetch_before = PREFETCH_ALL;
886 0 : if (prefetch_before < ref->prefetch_before)
887 0 : ref->prefetch_before = prefetch_before;
888 :
889 0 : return;
890 : }
891 :
892 : /* Try also the following iteration. */
893 2 : prefetch_before++;
894 2 : delta = step - delta;
895 2 : if (is_miss_rate_acceptable (prefetch_block, step, delta,
896 : reduced_prefetch_block, align_unit))
897 : {
898 0 : if (prefetch_before < ref->prefetch_before)
899 0 : ref->prefetch_before = prefetch_before;
900 :
901 0 : return;
902 : }
903 :
904 : /* The ref probably does not reuse BY. */
905 : return;
906 : }
907 :
908 : /* Prune the prefetch candidate REF using the reuses with other references
909 : in REFS. */
910 :
911 : static void
912 174 : prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
913 : {
914 174 : struct mem_ref *prune_by;
915 174 : bool before = true;
916 :
917 174 : prune_ref_by_self_reuse (ref);
918 :
919 602 : for (prune_by = refs; prune_by; prune_by = prune_by->next)
920 : {
921 254 : if (prune_by == ref)
922 : {
923 174 : before = false;
924 174 : continue;
925 : }
926 :
927 80 : if (!WRITE_CAN_USE_READ_PREFETCH
928 : && ref->write_p
929 : && !prune_by->write_p)
930 : continue;
931 80 : if (!READ_CAN_USE_WRITE_PREFETCH
932 80 : && !ref->write_p
933 54 : && prune_by->write_p)
934 0 : continue;
935 :
936 80 : prune_ref_by_group_reuse (ref, prune_by, before);
937 : }
938 174 : }
939 :
940 : /* Prune the prefetch candidates in GROUP using the reuse analysis. */
941 :
942 : static void
943 144 : prune_group_by_reuse (struct mem_ref_group *group)
944 : {
945 144 : struct mem_ref *ref_pruned;
946 :
947 318 : for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
948 : {
949 174 : prune_ref_by_reuse (ref_pruned, group->refs);
950 :
951 174 : if (dump_file && (dump_flags & TDF_DETAILS))
952 : {
953 27 : dump_mem_ref (dump_file, ref_pruned);
954 :
955 27 : if (ref_pruned->prefetch_before == PREFETCH_ALL
956 26 : && ref_pruned->prefetch_mod == 1)
957 5 : fprintf (dump_file, " no restrictions");
958 22 : else if (ref_pruned->prefetch_before == 0)
959 1 : fprintf (dump_file, " do not prefetch");
960 21 : else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
961 0 : fprintf (dump_file, " prefetch once");
962 : else
963 : {
964 21 : if (ref_pruned->prefetch_before != PREFETCH_ALL)
965 : {
966 0 : fprintf (dump_file, " prefetch before ");
967 0 : fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
968 : ref_pruned->prefetch_before);
969 : }
970 21 : if (ref_pruned->prefetch_mod != 1)
971 : {
972 21 : fprintf (dump_file, " prefetch mod ");
973 21 : fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
974 : ref_pruned->prefetch_mod);
975 : }
976 : }
977 27 : fprintf (dump_file, "\n");
978 : }
979 : }
980 144 : }
981 :
982 : /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
983 :
984 : static void
985 0 : prune_by_reuse (struct mem_ref_group *groups)
986 : {
987 217 : for (; groups; groups = groups->next)
988 144 : prune_group_by_reuse (groups);
989 0 : }
990 :
991 : /* Returns true if we should issue a prefetch for REF. */
992 :
993 : static bool
994 564 : should_issue_prefetch_p (struct mem_ref *ref)
995 : {
996 : /* Do we want to issue prefetches for non-constant strides? */
997 564 : if (!cst_and_fits_in_hwi (ref->group->step)
998 564 : && param_prefetch_dynamic_strides == 0)
999 : {
1000 0 : if (dump_file && (dump_flags & TDF_DETAILS))
1001 0 : fprintf (dump_file,
1002 : "Skipping non-constant step for reference %u:%u\n",
1003 0 : ref->group->uid, ref->uid);
1004 0 : return false;
1005 : }
1006 :
1007 : /* Some processors may have a hardware prefetcher that may conflict with
1008 : prefetch hints for a range of strides. Make sure we don't issue
1009 : prefetches for such cases if the stride is within this particular
1010 : range. */
1011 564 : if (cst_and_fits_in_hwi (ref->group->step)
1012 564 : && abs_hwi (int_cst_value (ref->group->step))
1013 413 : < (HOST_WIDE_INT) param_prefetch_minimum_stride)
1014 : {
1015 0 : if (dump_file && (dump_flags & TDF_DETAILS))
1016 0 : fprintf (dump_file,
1017 : "Step for reference %u:%u (" HOST_WIDE_INT_PRINT_DEC
1018 : ") is less than the mininum required stride of %d\n",
1019 0 : ref->group->uid, ref->uid, int_cst_value (ref->group->step),
1020 : param_prefetch_minimum_stride);
1021 0 : return false;
1022 : }
1023 :
1024 : /* For now do not issue prefetches for only first few of the
1025 : iterations. */
1026 564 : if (ref->prefetch_before != PREFETCH_ALL)
1027 : {
1028 89 : if (dump_file && (dump_flags & TDF_DETAILS))
1029 4 : fprintf (dump_file, "Ignoring reference %u:%u due to prefetch_before\n",
1030 4 : ref->group->uid, ref->uid);
1031 89 : return false;
1032 : }
1033 :
1034 : /* Do not prefetch nontemporal stores. */
1035 475 : if (ref->storent_p)
1036 : {
1037 2 : if (dump_file && (dump_flags & TDF_DETAILS))
1038 2 : fprintf (dump_file, "Ignoring nontemporal store reference %u:%u\n", ref->group->uid, ref->uid);
1039 2 : return false;
1040 : }
1041 :
1042 : return true;
1043 : }
1044 :
1045 : /* Decide which of the prefetch candidates in GROUPS to prefetch.
1046 : AHEAD is the number of iterations to prefetch ahead (which corresponds
1047 : to the number of simultaneous instances of one prefetch running at a
1048 : time). UNROLL_FACTOR is the factor by which the loop is going to be
1049 : unrolled. Returns true if there is anything to prefetch. */
1050 :
1051 : static bool
1052 64 : schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
1053 : unsigned ahead)
1054 : {
1055 64 : unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
1056 64 : unsigned slots_per_prefetch;
1057 64 : struct mem_ref *ref;
1058 64 : bool any = false;
1059 :
1060 : /* At most param_simultaneous_prefetches should be running
1061 : at the same time. */
1062 64 : remaining_prefetch_slots = param_simultaneous_prefetches;
1063 :
1064 : /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1065 : AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
1066 : it will need a prefetch slot. */
1067 64 : slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
1068 64 : if (dump_file && (dump_flags & TDF_DETAILS))
1069 15 : fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
1070 : slots_per_prefetch);
1071 :
1072 : /* For now we just take memory references one by one and issue
1073 : prefetches for as many as possible. The groups are sorted
1074 : starting with the largest step, since the references with
1075 : large step are more likely to cause many cache misses. */
1076 :
1077 193 : for (; groups; groups = groups->next)
1078 286 : for (ref = groups->refs; ref; ref = ref->next)
1079 : {
1080 157 : if (!should_issue_prefetch_p (ref))
1081 27 : continue;
1082 :
1083 : /* The loop is far from being sufficiently unrolled for this
1084 : prefetch. Do not generate the prefetch to avoid many redundant
1085 : prefetches. */
1086 130 : if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1087 3 : continue;
1088 :
1089 : /* If we need to prefetch the reference every PREFETCH_MOD iterations,
1090 : and we unroll the loop UNROLL_FACTOR times, we need to insert
1091 : ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1092 : iteration. */
1093 127 : n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1094 127 : / ref->prefetch_mod);
1095 127 : prefetch_slots = n_prefetches * slots_per_prefetch;
1096 :
1097 : /* If more than half of the prefetches would be lost anyway, do not
1098 : issue the prefetch. */
1099 127 : if (2 * remaining_prefetch_slots < prefetch_slots)
1100 2 : continue;
1101 :
1102 : /* Stop prefetching if debug counter is activated. */
1103 125 : if (!dbg_cnt (prefetch))
1104 0 : continue;
1105 :
1106 125 : ref->issue_prefetch_p = true;
1107 125 : if (dump_file && (dump_flags & TDF_DETAILS))
1108 20 : fprintf (dump_file, "Decided to issue prefetch for reference %u:%u\n",
1109 20 : ref->group->uid, ref->uid);
1110 :
1111 125 : if (remaining_prefetch_slots <= prefetch_slots)
1112 : return true;
1113 123 : remaining_prefetch_slots -= prefetch_slots;
1114 123 : any = true;
1115 : }
1116 :
1117 : return any;
1118 : }
1119 :
1120 : /* Return TRUE if no prefetch is going to be generated in the given
1121 : GROUPS. */
1122 :
1123 : static bool
1124 73 : nothing_to_prefetch_p (struct mem_ref_group *groups)
1125 : {
1126 73 : struct mem_ref *ref;
1127 :
1128 74 : for (; groups; groups = groups->next)
1129 81 : for (ref = groups->refs; ref; ref = ref->next)
1130 80 : if (should_issue_prefetch_p (ref))
1131 : return false;
1132 :
1133 : return true;
1134 : }
1135 :
1136 : /* Estimate the number of prefetches in the given GROUPS.
1137 : UNROLL_FACTOR is the factor by which LOOP was unrolled. */
1138 :
1139 : static int
1140 68 : estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1141 : {
1142 68 : struct mem_ref *ref;
1143 68 : unsigned n_prefetches;
1144 68 : int prefetch_count = 0;
1145 :
1146 211 : for (; groups; groups = groups->next)
1147 314 : for (ref = groups->refs; ref; ref = ref->next)
1148 171 : if (should_issue_prefetch_p (ref))
1149 : {
1150 145 : n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1151 145 : / ref->prefetch_mod);
1152 145 : prefetch_count += n_prefetches;
1153 : }
1154 :
1155 68 : return prefetch_count;
1156 : }
1157 :
1158 : /* Issue prefetches for the reference REF into the loop as decided before.
1159 : AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
1160 : is the factor by which LOOP was unrolled. */
1161 :
1162 : static void
1163 125 : issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1164 : {
1165 125 : HOST_WIDE_INT delta;
1166 125 : tree addr, addr_base, write_p, local, forward;
1167 125 : gcall *prefetch;
1168 125 : gimple_stmt_iterator bsi;
1169 125 : unsigned n_prefetches, ap;
1170 125 : bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1171 :
1172 125 : if (dump_file && (dump_flags & TDF_DETAILS))
1173 20 : fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n",
1174 : nontemporal ? " nontemporal" : "",
1175 20 : ref->group->uid, ref->uid);
1176 :
1177 125 : bsi = gsi_for_stmt (ref->stmt);
1178 :
1179 125 : n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1180 125 : / ref->prefetch_mod);
1181 125 : addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1182 125 : addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1183 : true, NULL, true, GSI_SAME_STMT);
1184 125 : write_p = ref->write_p ? integer_one_node : integer_zero_node;
1185 125 : local = nontemporal ? integer_zero_node : build_int_cst (integer_type_node, 3);
1186 :
1187 250 : for (ap = 0; ap < n_prefetches; ap++)
1188 : {
1189 125 : if (cst_and_fits_in_hwi (ref->group->step))
1190 : {
1191 : /* Determine the address to prefetch. */
1192 160 : delta = (ahead + ap * ref->prefetch_mod) *
1193 80 : int_cst_value (ref->group->step);
1194 80 : addr = fold_build_pointer_plus_hwi (addr_base, delta);
1195 80 : addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1196 : NULL, true, GSI_SAME_STMT);
1197 : }
1198 : else
1199 : {
1200 : /* The step size is non-constant but loop-invariant. We use the
1201 : heuristic of simply prefetching AHEAD iterations ahead. */
1202 45 : forward = fold_build2 (MULT_EXPR, sizetype,
1203 : fold_convert (sizetype, ref->group->step),
1204 : fold_convert (sizetype, size_int (ahead)));
1205 45 : addr = fold_build_pointer_plus (addr_base, forward);
1206 45 : addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1207 : NULL, true, GSI_SAME_STMT);
1208 : }
1209 :
1210 125 : if (addr_base != addr
1211 123 : && TREE_CODE (addr_base) == SSA_NAME
1212 123 : && TREE_CODE (addr) == SSA_NAME)
1213 : {
1214 123 : duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
1215 : /* As this isn't a plain copy we have to reset alignment
1216 : information. */
1217 123 : if (SSA_NAME_PTR_INFO (addr))
1218 70 : mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
1219 : }
1220 :
1221 : /* Create the prefetch instruction. */
1222 125 : prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1223 : 3, addr, write_p, local);
1224 125 : gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1225 : }
1226 125 : }
1227 :
1228 : /* Issue prefetches for the references in GROUPS into the loop as decided before.
1229 : AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
1230 : factor by which LOOP was unrolled. */
1231 :
1232 : static void
1233 62 : issue_prefetches (struct mem_ref_group *groups,
1234 : unsigned unroll_factor, unsigned ahead)
1235 : {
1236 62 : struct mem_ref *ref;
1237 :
1238 195 : for (; groups; groups = groups->next)
1239 294 : for (ref = groups->refs; ref; ref = ref->next)
1240 161 : if (ref->issue_prefetch_p)
1241 125 : issue_prefetch_ref (ref, unroll_factor, ahead);
1242 62 : }
1243 :
1244 : /* Returns true if REF is a memory write for which a nontemporal store insn
1245 : can be used. */
1246 :
1247 : static bool
1248 151 : nontemporal_store_p (struct mem_ref *ref)
1249 : {
1250 151 : machine_mode mode;
1251 151 : enum insn_code code;
1252 :
1253 : /* REF must be a write that is not reused. We require it to be independent
1254 : of all other memory references in the loop, as the nontemporal stores may
1255 : be reordered with respect to other memory references. */
1256 151 : if (!ref->write_p
1257 45 : || !ref->independent_p
1258 19 : || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1259 : return false;
1260 :
1261 : /* Check that we have the storent instruction for the mode. */
1262 2 : mode = TYPE_MODE (TREE_TYPE (ref->mem));
1263 2 : if (mode == BLKmode)
1264 : return false;
1265 :
1266 2 : code = optab_handler (storent_optab, mode);
1267 2 : return code != CODE_FOR_nothing;
1268 : }
1269 :
1270 : /* If REF is a nontemporal store, we mark the corresponding modify statement
1271 : and return true. Otherwise, we return false. */
1272 :
1273 : static bool
1274 151 : mark_nontemporal_store (struct mem_ref *ref)
1275 : {
1276 151 : if (!nontemporal_store_p (ref))
1277 : return false;
1278 :
1279 2 : if (dump_file && (dump_flags & TDF_DETAILS))
1280 2 : fprintf (dump_file, "Marked reference %u:%u as a nontemporal store.\n",
1281 2 : ref->group->uid, ref->uid);
1282 :
1283 2 : gimple_assign_set_nontemporal_move (ref->stmt, true);
1284 2 : ref->storent_p = true;
1285 :
1286 2 : return true;
1287 : }
1288 :
1289 : /* Issue a memory fence instruction after LOOP. */
1290 :
1291 : static void
1292 2 : emit_mfence_after_loop (class loop *loop)
1293 : {
1294 2 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1295 2 : edge exit;
1296 2 : gcall *call;
1297 2 : gimple_stmt_iterator bsi;
1298 2 : unsigned i;
1299 :
1300 8 : FOR_EACH_VEC_ELT (exits, i, exit)
1301 : {
1302 2 : call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1303 :
1304 2 : if (!single_pred_p (exit->dest)
1305 : /* If possible, we prefer not to insert the fence on other paths
1306 : in the CFG. */
1307 2 : && !(exit->flags & EDGE_ABNORMAL))
1308 0 : split_loop_exit_edge (exit);
1309 2 : bsi = gsi_after_labels (exit->dest);
1310 :
1311 2 : gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1312 : }
1313 2 : }
1314 :
1315 : /* Returns true if we can use storent in loop, false otherwise. */
1316 :
1317 : static bool
1318 64 : may_use_storent_in_loop_p (class loop *loop)
1319 : {
1320 64 : bool ret = true;
1321 :
1322 64 : if (loop->inner != NULL)
1323 : return false;
1324 :
1325 : /* If we must issue a mfence insn after using storent, check that there
1326 : is a suitable place for it at each of the loop exits. */
1327 62 : if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1328 : {
1329 62 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1330 62 : unsigned i;
1331 62 : edge exit;
1332 :
1333 207 : FOR_EACH_VEC_ELT (exits, i, exit)
1334 42 : if ((exit->flags & EDGE_ABNORMAL)
1335 0 : && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1336 42 : ret = false;
1337 62 : }
1338 :
1339 : return ret;
1340 : }
1341 :
1342 : /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1343 : references in the loop. Returns whether we inserted any mfence call. */
1344 :
1345 : static bool
1346 64 : mark_nontemporal_stores (class loop *loop, struct mem_ref_group *groups)
1347 : {
1348 64 : struct mem_ref *ref;
1349 64 : bool any = false;
1350 :
1351 64 : if (!may_use_storent_in_loop_p (loop))
1352 : return false;
1353 :
1354 187 : for (; groups; groups = groups->next)
1355 276 : for (ref = groups->refs; ref; ref = ref->next)
1356 151 : any |= mark_nontemporal_store (ref);
1357 :
1358 62 : if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1359 : {
1360 2 : emit_mfence_after_loop (loop);
1361 2 : return true;
1362 : }
1363 : return false;
1364 : }
1365 :
1366 : /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1367 : this is the case, fill in DESC with the description of the number of
1368 : iterations. */
1369 :
1370 : static bool
1371 53 : should_unroll_loop_p (class loop *loop, class tree_niter_desc *desc,
1372 : unsigned factor)
1373 : {
1374 53 : if (!can_unroll_loop_p (loop, factor, desc))
1375 : return false;
1376 :
1377 : /* We only consider loops without control flow for unrolling. This is not
1378 : a hard restriction -- tree_unroll_loop works with arbitrary loops
1379 : as well; but the unrolling/prefetching is usually more profitable for
1380 : loops consisting of a single basic block, and we want to limit the
1381 : code growth. */
1382 43 : if (loop->num_nodes > 2)
1383 3 : return false;
1384 :
1385 : return true;
1386 : }
1387 :
1388 : /* Determine the factor by which to unroll LOOP, from the information
1389 : contained in the list of memory references REFS. The description of the
1390 : number of iterations of LOOP is stored in DESC. NINSNS is the number of
1391 : insns of the LOOP. EST_NITER is the estimated number of iterations of
1392 : the loop, or -1 if no estimate is available. */
1393 :
1394 : static unsigned
1395 68 : determine_unroll_factor (class loop *loop, struct mem_ref_group *refs,
1396 : unsigned ninsns, class tree_niter_desc *desc,
1397 : HOST_WIDE_INT est_niter)
1398 : {
1399 68 : unsigned upper_bound;
1400 68 : unsigned nfactor, factor, mod_constraint;
1401 68 : struct mem_ref_group *agp;
1402 68 : struct mem_ref *ref;
1403 :
1404 : /* Bail out early in case we must not unroll loops. */
1405 68 : if (!flag_unroll_loops)
1406 : return 1;
1407 :
1408 : /* First check whether the loop is not too large to unroll. We ignore
1409 : PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1410 : from unrolling them enough to make exactly one cache line covered by each
1411 : iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1412 : us from unrolling the loops too many times in cases where we only expect
1413 : gains from better scheduling and decreasing loop overhead, which is not
1414 : the case here. */
1415 67 : upper_bound = param_max_unrolled_insns / ninsns;
1416 :
1417 : /* If we unrolled the loop more times than it iterates, the unrolled version
1418 : of the loop would never be entered. */
1419 67 : if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1420 0 : upper_bound = est_niter;
1421 :
1422 67 : if (upper_bound <= 1)
1423 : return 1;
1424 :
1425 : /* Choose the factor so that we may prefetch each cache line just once,
1426 : but bound the unrolling by UPPER_BOUND. */
1427 : factor = 1;
1428 181 : for (agp = refs; agp; agp = agp->next)
1429 284 : for (ref = agp->refs; ref; ref = ref->next)
1430 156 : if (should_issue_prefetch_p (ref))
1431 : {
1432 130 : mod_constraint = ref->prefetch_mod;
1433 130 : nfactor = least_common_multiple (mod_constraint, factor);
1434 130 : if (nfactor <= upper_bound)
1435 156 : factor = nfactor;
1436 : }
1437 :
1438 53 : if (!should_unroll_loop_p (loop, desc, factor))
1439 : return 1;
1440 :
1441 : return factor;
1442 : }
1443 :
1444 : /* Returns the total volume of the memory references REFS, taking into account
1445 : reuses in the innermost loop and cache line size. TODO -- we should also
1446 : take into account reuses across the iterations of the loops in the loop
1447 : nest. */
1448 :
1449 : static unsigned
1450 66 : volume_of_references (struct mem_ref_group *refs)
1451 : {
1452 66 : unsigned volume = 0;
1453 66 : struct mem_ref_group *gr;
1454 66 : struct mem_ref *ref;
1455 :
1456 199 : for (gr = refs; gr; gr = gr->next)
1457 292 : for (ref = gr->refs; ref; ref = ref->next)
1458 : {
1459 : /* Almost always reuses another value? */
1460 159 : if (ref->prefetch_before != PREFETCH_ALL)
1461 26 : continue;
1462 :
1463 : /* If several iterations access the same cache line, use the size of
1464 : the line divided by this number. Otherwise, a cache line is
1465 : accessed in each iteration. TODO -- in the latter case, we should
1466 : take the size of the reference into account, rounding it up on cache
1467 : line size multiple. */
1468 133 : volume += param_l1_cache_line_size / ref->prefetch_mod;
1469 : }
1470 66 : return volume;
1471 : }
1472 :
1473 : /* Returns the volume of memory references accessed across VEC iterations of
1474 : loops, whose sizes are described in the LOOP_SIZES array. N is the number
1475 : of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1476 :
1477 : static unsigned
1478 168 : volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1479 : {
1480 168 : unsigned i;
1481 :
1482 353 : for (i = 0; i < n; i++)
1483 199 : if (vec[i] != 0)
1484 : break;
1485 :
1486 168 : if (i == n)
1487 : return 0;
1488 :
1489 14 : gcc_assert (vec[i] > 0);
1490 :
1491 : /* We ignore the parts of the distance vector in subloops, since usually
1492 : the numbers of iterations are much smaller. */
1493 14 : return loop_sizes[i] * vec[i];
1494 : }
1495 :
1496 : /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
1497 : at the position corresponding to the loop of the step. N is the depth
1498 : of the considered loop nest, and LOOP is its innermost loop. */
1499 :
1500 : static void
1501 167 : add_subscript_strides (tree access_fn, unsigned stride,
1502 : HOST_WIDE_INT *strides, unsigned n, class loop *loop)
1503 : {
1504 167 : class loop *aloop;
1505 167 : tree step;
1506 167 : HOST_WIDE_INT astep;
1507 167 : unsigned min_depth = loop_depth (loop) - n;
1508 :
1509 343 : while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1510 : {
1511 176 : aloop = get_chrec_loop (access_fn);
1512 176 : step = CHREC_RIGHT (access_fn);
1513 176 : access_fn = CHREC_LEFT (access_fn);
1514 :
1515 176 : if ((unsigned) loop_depth (aloop) <= min_depth)
1516 0 : continue;
1517 :
1518 176 : if (tree_fits_shwi_p (step))
1519 127 : astep = tree_to_shwi (step);
1520 : else
1521 49 : astep = param_l1_cache_line_size;
1522 :
1523 176 : strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1524 :
1525 : }
1526 167 : }
1527 :
1528 : /* Returns the volume of memory references accessed between two consecutive
1529 : self-reuses of the reference DR. We consider the subscripts of DR in N
1530 : loops, and LOOP_SIZES contains the volumes of accesses in each of the
1531 : loops. LOOP is the innermost loop of the current loop nest. */
1532 :
1533 : static unsigned
1534 159 : self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1535 : class loop *loop)
1536 : {
1537 159 : tree stride, access_fn;
1538 159 : HOST_WIDE_INT *strides, astride;
1539 159 : vec<tree> access_fns;
1540 159 : tree ref = DR_REF (dr);
1541 159 : unsigned i, ret = ~0u;
1542 :
1543 : /* In the following example:
1544 :
1545 : for (i = 0; i < N; i++)
1546 : for (j = 0; j < N; j++)
1547 : use (a[j][i]);
1548 : the same cache line is accessed every N steps (except if the change from
1549 : i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1550 : we cannot rely purely on the results of the data dependence analysis.
1551 :
1552 : Instead, we compute the stride of the reference in each loop, and consider
1553 : the innermost loop in which the stride is less than the cache line size. */
1554 :
1555 159 : strides = XCNEWVEC (HOST_WIDE_INT, n);
1556 159 : access_fns = DR_ACCESS_FNS (dr);
1557 :
1558 650 : FOR_EACH_VEC_ELT (access_fns, i, access_fn)
1559 : {
1560 : /* Keep track of the reference corresponding to the subscript, so that we
1561 : know its stride. */
1562 169 : while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1563 2 : ref = TREE_OPERAND (ref, 0);
1564 :
1565 167 : if (TREE_CODE (ref) == ARRAY_REF)
1566 : {
1567 47 : stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1568 47 : if (tree_fits_uhwi_p (stride))
1569 47 : astride = tree_to_uhwi (stride);
1570 : else
1571 0 : astride = param_l1_cache_line_size;
1572 :
1573 47 : ref = TREE_OPERAND (ref, 0);
1574 : }
1575 : else
1576 : astride = 1;
1577 :
1578 167 : add_subscript_strides (access_fn, astride, strides, n, loop);
1579 : }
1580 :
1581 347 : for (i = n; i-- > 0; )
1582 : {
1583 197 : unsigned HOST_WIDE_INT s;
1584 :
1585 197 : s = strides[i] < 0 ? -strides[i] : strides[i];
1586 :
1587 197 : if (s < (unsigned) param_l1_cache_line_size
1588 132 : && (loop_sizes[i]
1589 132 : > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1590 : {
1591 : ret = loop_sizes[i];
1592 : break;
1593 : }
1594 : }
1595 :
1596 159 : free (strides);
1597 159 : return ret;
1598 : }
1599 :
1600 : /* Determines the distance till the first reuse of each reference in REFS
1601 : in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1602 : memory references in the loop. Return false if the analysis fails. */
1603 :
1604 : static bool
1605 68 : determine_loop_nest_reuse (class loop *loop, struct mem_ref_group *refs,
1606 : bool no_other_refs)
1607 : {
1608 68 : class loop *nest, *aloop;
1609 68 : vec<data_reference_p> datarefs = vNULL;
1610 68 : vec<ddr_p> dependences = vNULL;
1611 68 : struct mem_ref_group *gr;
1612 68 : struct mem_ref *ref, *refb;
1613 68 : auto_vec<loop_p> vloops;
1614 68 : unsigned *loop_data_size;
1615 68 : unsigned i, j, n;
1616 68 : unsigned volume, dist, adist;
1617 68 : HOST_WIDE_INT vol;
1618 68 : data_reference_p dr;
1619 68 : ddr_p dep;
1620 :
1621 68 : if (loop->inner)
1622 : return true;
1623 :
1624 :   /* Find the outermost loop of the loop nest of LOOP (we require that
1625 : there are no sibling loops inside the nest). */
1626 : nest = loop;
1627 78 : while (1)
1628 : {
1629 78 : aloop = loop_outer (nest);
1630 :
1631 78 : if (aloop == current_loops->tree_root
1632 22 : || aloop->inner->next)
1633 : break;
1634 :
1635 : nest = aloop;
1636 : }
1637 :
1638 : /* For each loop, determine the amount of data accessed in each iteration.
1639 : We use this to estimate whether the reference is evicted from the
1640 : cache before its reuse. */
1641 66 : find_loop_nest (nest, &vloops);
1642 66 : n = vloops.length ();
1643 66 : loop_data_size = XNEWVEC (unsigned, n);
1644 66 : volume = volume_of_references (refs);
1645 66 : i = n;
1646 144 : while (i-- != 0)
1647 : {
1648 78 : loop_data_size[i] = volume;
1649 : /* Bound the volume by the L2 cache size, since above this bound,
1650 : all dependence distances are equivalent. */
1651 78 : if (volume > L2_CACHE_SIZE_BYTES)
1652 0 : continue;
1653 :
1654 78 : aloop = vloops[i];
1655 78 : vol = estimated_stmt_executions_int (aloop);
1656 78 : if (vol == -1)
1657 39 : vol = expected_loop_iterations (aloop);
1658 78 : volume *= vol;
1659 : }
1660 :
1661 : /* Prepare the references in the form suitable for data dependence
1662 : analysis. We ignore unanalyzable data references (the results
1663 :      are used just as a heuristic to estimate the temporality of the
1664 : references, hence we do not need to worry about correctness). */
1665 199 : for (gr = refs; gr; gr = gr->next)
1666 292 : for (ref = gr->refs; ref; ref = ref->next)
1667 : {
1668 159 : dr = create_data_ref (loop_preheader_edge (nest),
1669 : loop_containing_stmt (ref->stmt),
1670 159 : ref->mem, ref->stmt, !ref->write_p, false);
1671 :
1672 159 : if (dr)
1673 : {
1674 159 : ref->reuse_distance = volume;
1675 159 : dr->aux = ref;
1676 159 : datarefs.safe_push (dr);
1677 : }
1678 : else
1679 : no_other_refs = false;
1680 : }
1681 :
1682 225 : FOR_EACH_VEC_ELT (datarefs, i, dr)
1683 : {
1684 159 : dist = self_reuse_distance (dr, loop_data_size, n, loop);
1685 159 : ref = (struct mem_ref *) dr->aux;
1686 159 : if (ref->reuse_distance > dist)
1687 9 : ref->reuse_distance = dist;
1688 :
1689 159 : if (no_other_refs)
1690 108 : ref->independent_p = true;
1691 : }
1692 :
1693 66 : if (!compute_all_dependences (datarefs, &dependences, vloops, true))
1694 : return false;
1695 :
1696 462 : FOR_EACH_VEC_ELT (dependences, i, dep)
1697 : {
1698 396 : if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1699 125 : continue;
1700 :
1701 271 : ref = (struct mem_ref *) DDR_A (dep)->aux;
1702 271 : refb = (struct mem_ref *) DDR_B (dep)->aux;
1703 :
1704 271 : if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1705 158 : || DDR_COULD_BE_INDEPENDENT_P (dep)
1706 429 : || DDR_NUM_DIST_VECTS (dep) == 0)
1707 : {
1708 : /* If the dependence cannot be analyzed, assume that there might be
1709 : a reuse. */
1710 113 : dist = 0;
1711 :
1712 113 : ref->independent_p = false;
1713 113 : refb->independent_p = false;
1714 : }
1715 : else
1716 : {
1717 : /* The distance vectors are normalized to be always lexicographically
1718 : positive, hence we cannot tell just from them whether DDR_A comes
1719 : before DDR_B or vice versa. However, it is not important,
1720 : anyway -- if DDR_A is close to DDR_B, then it is either reused in
1721 : DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1722 : in cache (and marking it as nontemporal would not affect
1723 : anything). */
1724 :
1725 : dist = volume;
1726 326 : for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1727 : {
1728 168 : adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1729 : loop_data_size, n);
1730 :
1731 : /* If this is a dependence in the innermost loop (i.e., the
1732 : distances in all superloops are zero) and it is not
1733 : the trivial self-dependence with distance zero, record that
1734 : the references are not completely independent. */
1735 168 : if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1736 168 : && (ref != refb
1737 153 : || DDR_DIST_VECT (dep, j)[n-1] != 0))
1738 : {
1739 3 : ref->independent_p = false;
1740 3 : refb->independent_p = false;
1741 : }
1742 :
1743 : /* Ignore accesses closer than
1744 : L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1745 :                 so that we use nontemporal prefetches e.g. if a single memory
1746 : location is accessed several times in a single iteration of
1747 : the loop. */
1748 168 : if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1749 156 : continue;
1750 :
1751 12 : if (adist < dist)
1752 168 : dist = adist;
1753 : }
1754 : }
1755 :
1756 271 : if (ref->reuse_distance > dist)
1757 39 : ref->reuse_distance = dist;
1758 271 : if (refb->reuse_distance > dist)
1759 61 : refb->reuse_distance = dist;
1760 : }
1761 :
1762 66 : free_dependence_relations (dependences);
1763 66 : free_data_refs (datarefs);
1764 66 : free (loop_data_size);
1765 :
1766 66 : if (dump_file && (dump_flags & TDF_DETAILS))
1767 : {
1768 16 : fprintf (dump_file, "Reuse distances:\n");
1769 58 : for (gr = refs; gr; gr = gr->next)
1770 53 : for (ref = gr->refs; ref; ref = ref->next)
1771 27 : fprintf (dump_file, " reference %u:%u distance %u\n",
1772 27 : ref->group->uid, ref->uid, ref->reuse_distance);
1773 : }
1774 :
1775 : return true;
1776 68 : }
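/* A worked example with made-up numbers: suppose two references a[i] and
   a[i + 10000] occur in the same innermost loop and the dependence analysis
   reports a distance of 10000 iterations.  If each iteration touches roughly
   16 bytes, the resulting ADIST is about 160000 bytes, and the reuse distance
   of both references is capped at that value whenever it is smaller than the
   total volume.  Had the distance been only a few iterations, ADIST would
   fall below L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION and, as noted in the
   loop above, would be ignored so that such short-range reuse does not defeat
   nontemporal prefetching.  */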
1777 :
1778 : /* Determine whether or not the trip count to ahead ratio is too small based
1779 :    on profitability considerations.
1780 : AHEAD: the iteration ahead distance,
1781 : EST_NITER: the estimated trip count. */
1782 :
1783 : static bool
1784 155 : trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1785 : {
1786 : /* Assume trip count to ahead ratio is big enough if the trip count could not
1787 : be estimated at compile time. */
1788 155 : if (est_niter < 0)
1789 : return false;
1790 :
1791 125 : if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1792 : {
1793 35 : if (dump_file && (dump_flags & TDF_DETAILS))
1794 1 : fprintf (dump_file,
1795 : "Not prefetching -- loop estimated to roll only %d times\n",
1796 : (int) est_niter);
1797 35 : return true;
1798 : }
1799 :
1800 : return false;
1801 : }
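/* A worked example with made-up numbers: if prefetches must be issued
   AHEAD == 5 iterations in advance and TRIP_COUNT_TO_AHEAD_RATIO is, say, 4
   (the macro is defined earlier in this file; 4 is only an assumed value
   here), a loop estimated to roll fewer than 20 times is rejected, since most
   of its prefetches would either target data beyond the last iteration or
   arrive too late to hide the memory latency.  */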
1802 :
1803 : /* Determine whether or not the number of memory references in the loop is
1804 :    reasonable based on profitability and compilation time considerations.
1805 : NINSNS: estimated number of instructions in the loop,
1806 : MEM_REF_COUNT: total number of memory references in the loop. */
1807 :
1808 : static bool
1809 120 : mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1810 : {
1811 120 : int insn_to_mem_ratio;
1812 :
1813 120 : if (mem_ref_count == 0)
1814 : return false;
1815 :
1816 : /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1817 :      (compute_all_dependences) are quadratic in the number of memory references.
1818 : To avoid huge compilation time, we give up prefetching if mem_ref_count
1819 : is too large. */
1820 93 : if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1821 : return false;
1822 :
1823 : /* Prefetching improves performance by overlapping cache missing
1824 : memory accesses with CPU operations. If the loop does not have
1825 : enough CPU operations to overlap with memory operations, prefetching
1826 : won't give a significant benefit. One approximate way of checking
1827 : this is to require the ratio of instructions to memory references to
1828 : be above a certain limit. This approximation works well in practice.
1829 : TODO: Implement a more precise computation by estimating the time
1830 : for each CPU or memory op in the loop. Time estimates for memory ops
1831 : should account for cache misses. */
1832 93 : insn_to_mem_ratio = ninsns / mem_ref_count;
1833 :
1834 93 : if (insn_to_mem_ratio < param_prefetch_min_insn_to_mem_ratio)
1835 : {
1836 20 : if (dump_file && (dump_flags & TDF_DETAILS))
1837 0 : fprintf (dump_file,
1838 : "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1839 : insn_to_mem_ratio);
1840 20 : return false;
1841 : }
1842 :
1843 : return true;
1844 : }
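/* A worked example with made-up numbers: a loop body of roughly 40 insns
   containing 10 memory references has an insn-to-mem ratio of 4 and passes
   this check if param_prefetch_min_insn_to_mem_ratio is 3 (an assumed value;
   it can be changed with --param prefetch-min-insn-to-mem-ratio).  A loop of
   20 insns with the same 10 references (ratio 2) would be rejected, as it has
   too little computation to overlap with the outstanding prefetches.  */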
1845 :
1846 : /* Determine whether or not the instruction to prefetch ratio in the loop is
1847 :    too small based on profitability considerations.
1848 : NINSNS: estimated number of instructions in the loop,
1849 : PREFETCH_COUNT: an estimate of the number of prefetches,
1850 : UNROLL_FACTOR: the factor to unroll the loop if prefetching. */
1851 :
1852 : static bool
1853 68 : insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1854 : unsigned unroll_factor)
1855 : {
1856 68 : int insn_to_prefetch_ratio;
1857 :
1858 : /* Prefetching most likely causes performance degradation when the instruction
1859 : to prefetch ratio is too small. Too many prefetch instructions in a loop
1860 : may reduce the I-cache performance.
1861 : (unroll_factor * ninsns) is used to estimate the number of instructions in
1862 : the unrolled loop. This implementation is a bit simplistic -- the number
1863 : of issued prefetch instructions is also affected by unrolling. So,
1864 : prefetch_mod and the unroll factor should be taken into account when
1865 : determining prefetch_count. Also, the number of insns of the unrolled
1866 : loop will usually be significantly smaller than the number of insns of the
1867 :    original loop * unroll_factor (at least the induction variable increments
1868 :    and the exit branches will be eliminated), so it might be better to use
1869 : tree_estimate_loop_size + estimated_unrolled_size. */
1870 68 : insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1871 68 : if (insn_to_prefetch_ratio < param_min_insn_to_prefetch_ratio)
1872 : {
1873 4 : if (dump_file && (dump_flags & TDF_DETAILS))
1874 1 : fprintf (dump_file,
1875 : "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1876 : insn_to_prefetch_ratio);
1877 4 : return true;
1878 : }
1879 :
1880 : return false;
1881 : }
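/* A worked example with made-up numbers: with NINSNS == 10,
   UNROLL_FACTOR == 4 and PREFETCH_COUNT == 6, the estimated ratio is
   (4 * 10) / 6 == 6.  Assuming param_min_insn_to_prefetch_ratio is 9 (an
   assumed value; it can be changed with --param min-insn-to-prefetch-ratio),
   the loop is rejected: roughly one prefetch instruction every six insns is
   considered more likely to hurt I-cache behaviour than the prefetches are to
   help.  */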
1882 :
1883 :
1884 : /* Issue prefetch instructions for array references in LOOP. Returns
1885 : true if the LOOP was unrolled and updates NEED_LC_SSA_UPDATE if we need
1886 : to update SSA for virtual operands and LC SSA for a split edge. */
1887 :
1888 : static bool
1889 155 : loop_prefetch_arrays (class loop *loop, bool &need_lc_ssa_update)
1890 : {
1891 155 : struct mem_ref_group *refs;
1892 155 : unsigned ahead, ninsns, time, unroll_factor;
1893 155 : HOST_WIDE_INT est_niter;
1894 155 : class tree_niter_desc desc;
1895 155 : bool unrolled = false, no_other_refs;
1896 155 : unsigned prefetch_count;
1897 155 : unsigned mem_ref_count;
1898 :
1899 155 : if (optimize_loop_nest_for_size_p (loop))
1900 : {
1901 0 : if (dump_file && (dump_flags & TDF_DETAILS))
1902 0 : fprintf (dump_file, " ignored (cold area)\n");
1903 0 : return false;
1904 : }
1905 :
1906 : /* FIXME: the time should be weighted by the probabilities of the blocks in
1907 : the loop body. */
1908 155 : time = tree_num_loop_insns (loop, &eni_time_weights);
1909 155 : if (time == 0)
1910 : return false;
1911 :
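  /* AHEAD below is the ceiling of the prefetch latency divided by the time
     spent in one iteration.  For example (made-up numbers), a latency of 200
     with 60 units of time per iteration gives ahead == 4, so prefetches are
     issued for data used roughly four iterations later.  */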
1912 155 : ahead = (param_prefetch_latency + time - 1) / time;
1913 155 : est_niter = estimated_stmt_executions_int (loop);
1914 155 : if (est_niter == -1)
1915 84 : est_niter = likely_max_stmt_executions_int (loop);
1916 :
1917 : /* Prefetching is not likely to be profitable if the trip count to ahead
1918 : ratio is too small. */
1919 155 : if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1920 : return false;
1921 :
1922 120 : ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1923 :
1924 : /* Step 1: gather the memory references. */
1925 120 : refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1926 :
1927 : /* Give up prefetching if the number of memory references in the
1928 :      loop is not reasonable based on profitability and compilation time
1929 : considerations. */
1930 120 : if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1931 47 : goto fail;
1932 :
1933 : /* Step 2: estimate the reuse effects. */
1934 73 : prune_by_reuse (refs);
1935 :
1936 73 : if (nothing_to_prefetch_p (refs))
1937 5 : goto fail;
1938 :
1939 68 : if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
1940 0 : goto fail;
1941 :
1942 : /* Step 3: determine unroll factor. */
1943 68 : unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1944 : est_niter);
1945 :
1946 : /* Estimate prefetch count for the unrolled loop. */
1947 68 : prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1948 68 : if (prefetch_count == 0)
1949 0 : goto fail;
1950 :
1951 68 : if (dump_file && (dump_flags & TDF_DETAILS))
1952 16 : fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1953 : HOST_WIDE_INT_PRINT_DEC "\n"
1954 : "insn count %d, mem ref count %d, prefetch count %d\n",
1955 : ahead, unroll_factor, est_niter,
1956 : ninsns, mem_ref_count, prefetch_count);
1957 :
1958 : /* Prefetching is not likely to be profitable if the instruction to prefetch
1959 : ratio is too small. */
1960 68 : if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1961 : unroll_factor))
1962 4 : goto fail;
1963 :
1964 64 : need_lc_ssa_update |= mark_nontemporal_stores (loop, refs);
1965 :
1966 : /* Step 4: what to prefetch? */
1967 64 : if (!schedule_prefetches (refs, unroll_factor, ahead))
1968 2 : goto fail;
1969 :
1970 : /* Step 5: unroll the loop. TODO -- peeling of first and last few
1971 : iterations so that we do not issue superfluous prefetches. */
1972 62 : if (unroll_factor != 1)
1973 : {
1974 33 : tree_unroll_loop (loop, unroll_factor, &desc);
1975 33 : unrolled = true;
1976 : }
1977 :
1978 : /* Step 6: issue the prefetches. */
1979 62 : issue_prefetches (refs, unroll_factor, ahead);
1980 :
1981 120 : fail:
1982 120 : release_mem_refs (refs);
1983 120 : return unrolled;
1984 155 : }
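/* An illustrative sketch with made-up numbers of what the pass produces: for
   a loop such as

     for (i = 0; i < n; i++)
       sum += a[i];

   over 8-byte elements with a 64-byte cache line, the steps above might pick
   an unroll factor of 8 (one prefetch per cache line) and an ahead distance
   of a few iterations, emitting code along the lines of

     for (i = 0; i < n; i += 8)
       {
         __builtin_prefetch (&a[i + 32], 0, 3);
         sum += a[i]; sum += a[i + 1]; ... sum += a[i + 7];
       }

   The unroll factor, prefetch distance and locality hint are illustrative;
   the actual values depend on the target parameters dumped below and on the
   reuse analysis above.  */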
1985 :
1986 : /* Issue prefetch instructions for array references in loops. */
1987 :
1988 : unsigned int
1989 110 : tree_ssa_prefetch_arrays (void)
1990 : {
1991 110 : bool unrolled = false;
1992 110 : bool need_lc_ssa_update = false;
1993 110 : int todo_flags = 0;
1994 :
1995 110 : if (!targetm.have_prefetch ()
1996 :      /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
1997 :         -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
1998 :         of the processor costs and i486 does not have a prefetch instruction,
1999 :         but -march=pentium4 causes targetm.have_prefetch to return true.  Ugh. */
2000 110 : || PREFETCH_BLOCK == 0)
2001 : return 0;
2002 :
2003 110 : if (dump_file && (dump_flags & TDF_DETAILS))
2004 : {
2005 11 : fprintf (dump_file, "Prefetching parameters:\n");
2006 11 : fprintf (dump_file, " simultaneous prefetches: %d\n",
2007 : param_simultaneous_prefetches);
2008 11 : fprintf (dump_file, " prefetch latency: %d\n", param_prefetch_latency);
2009 11 : fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
2010 11 : fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
2011 11 : L1_CACHE_SIZE_BYTES / param_l1_cache_line_size,
2012 : param_l1_cache_size);
2013 11 : fprintf (dump_file, " L1 cache line size: %d\n",
2014 : param_l1_cache_line_size);
2015 11 : fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size);
2016 11 : fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
2017 : param_min_insn_to_prefetch_ratio);
2018 11 : fprintf (dump_file, " min insn-to-mem ratio: %d \n",
2019 : param_prefetch_min_insn_to_mem_ratio);
2020 11 : fprintf (dump_file, "\n");
2021 : }
2022 :
2023 110 : initialize_original_copy_tables ();
2024 :
2025 110 : if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
2026 : {
2027 4 : tree type = build_function_type_list (void_type_node,
2028 : const_ptr_type_node, NULL_TREE);
2029 4 : tree decl = add_builtin_function ("__builtin_prefetch", type,
2030 : BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
2031 : NULL, NULL_TREE);
2032 4 : DECL_IS_NOVOPS (decl) = true;
2033 4 : set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
2034 : }
2035 :
2036 485 : for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
2037 : {
2038 155 : if (dump_file && (dump_flags & TDF_DETAILS))
2039 25 : fprintf (dump_file, "Processing loop %d:\n", loop->num);
2040 :
2041 155 : unrolled |= loop_prefetch_arrays (loop, need_lc_ssa_update);
2042 :
2043 155 : if (dump_file && (dump_flags & TDF_DETAILS))
2044 25 : fprintf (dump_file, "\n\n");
2045 110 : }
2046 :
2047 110 : if (need_lc_ssa_update)
2048 2 : rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_only_virtuals);
2049 :
2050 110 : if (unrolled)
2051 : {
2052 19 : scev_reset ();
2053 19 : todo_flags |= TODO_cleanup_cfg;
2054 : }
2055 :
2056 110 : free_original_copy_tables ();
2057 110 : return todo_flags;
2058 : }
2059 :
2060 : /* Prefetching. */
2061 :
2062 : namespace {
2063 :
2064 : const pass_data pass_data_loop_prefetch =
2065 : {
2066 : GIMPLE_PASS, /* type */
2067 : "aprefetch", /* name */
2068 : OPTGROUP_LOOP, /* optinfo_flags */
2069 : TV_TREE_PREFETCH, /* tv_id */
2070 : ( PROP_cfg | PROP_ssa ), /* properties_required */
2071 : 0, /* properties_provided */
2072 : 0, /* properties_destroyed */
2073 : 0, /* todo_flags_start */
2074 : 0, /* todo_flags_finish */
2075 : };
2076 :
2077 : class pass_loop_prefetch : public gimple_opt_pass
2078 : {
2079 : public:
2080 285722 : pass_loop_prefetch (gcc::context *ctxt)
2081 571444 : : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
2082 : {}
2083 :
2084 : /* opt_pass methods: */
2085 241458 : bool gate (function *) final override
2086 : {
2087 241458 : return flag_prefetch_loop_arrays > 0;
2088 : }
2089 : unsigned int execute (function *) final override;
2090 :
2091 : }; // class pass_loop_prefetch
2092 :
2093 : unsigned int
2094 112 : pass_loop_prefetch::execute (function *fun)
2095 : {
2096 226 : if (number_of_loops (fun) <= 1)
2097 : return 0;
2098 :
2099 112 : if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0)
2100 : {
2101 2 : static bool warned = false;
2102 :
2103 2 : if (!warned)
2104 : {
2105 1 : warning (OPT_Wdisabled_optimization,
2106 : "%<l1-cache-size%> parameter is not a power of two: %d",
2107 : PREFETCH_BLOCK);
2108 1 : warned = true;
2109 : }
2110 2 : return 0;
2111 : }
2112 :
2113 110 : return tree_ssa_prefetch_arrays ();
2114 : }
2115 :
2116 : } // anon namespace
2117 :
2118 : gimple_opt_pass *
2119 285722 : make_pass_loop_prefetch (gcc::context *ctxt)
2120 : {
2121 285722 : return new pass_loop_prefetch (ctxt);
2122 : }
2123 :
2124 :
|