Branch data Line data Source code
1 : : /* Loop unroll-and-jam.
2 : : Copyright (C) 2017-2025 Free Software Foundation, Inc.
3 : :
4 : : This file is part of GCC.
5 : :
6 : : GCC is free software; you can redistribute it and/or modify it
7 : : under the terms of the GNU General Public License as published by the
8 : : Free Software Foundation; either version 3, or (at your option) any
9 : : later version.
10 : :
11 : : GCC is distributed in the hope that it will be useful, but WITHOUT
12 : : ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 : : for more details.
15 : :
16 : : You should have received a copy of the GNU General Public License
17 : : along with GCC; see the file COPYING3. If not see
18 : : <http://www.gnu.org/licenses/>. */
19 : :
20 : : #include "config.h"
21 : : #include "system.h"
22 : : #include "coretypes.h"
23 : : #include "tree-pass.h"
24 : : #include "backend.h"
25 : : #include "tree.h"
26 : : #include "gimple.h"
27 : : #include "ssa.h"
28 : : #include "fold-const.h"
29 : : #include "tree-cfg.h"
30 : : #include "tree-ssa.h"
31 : : #include "tree-ssa-loop-niter.h"
32 : : #include "tree-ssa-loop.h"
33 : : #include "tree-ssa-loop-manip.h"
34 : : #include "cfgloop.h"
35 : : #include "tree-scalar-evolution.h"
36 : : #include "gimple-iterator.h"
37 : : #include "cfghooks.h"
38 : : #include "tree-data-ref.h"
39 : : #include "tree-ssa-loop-ivopts.h"
40 : : #include "tree-vectorizer.h"
41 : : #include "tree-ssa-sccvn.h"
42 : : #include "tree-cfgcleanup.h"
43 : :
44 : : /* Unroll and Jam transformation
45 : :
46 : : This is a combination of two transformations, where the second
47 : : is not always valid. It's applicable if a loop nest has redundancies
48 : : over the iterations of an outer loop while not having that with
49 : : an inner loop.
50 : :
51 : : Given this nest:
52 : : for (i) {
53 : : for (j) {
54 : : B(i,j)
55 : : }
56 : : }
57 : :
58 : : first unroll:
59 : : for (i by 2) {
60 : : for (j) {
61 : : B(i,j)
62 : : }
63 : : for (j) {
64 : : B(i+1,j)
65 : : }
66 : : }
67 : :
68 : : then fuse the two adjacent inner loops resulting from that:
69 : : for (i by 2) {
70 : : for (j) {
71 : : B(i,j)
72 : : B(i+1,j)
73 : : }
74 : : }
75 : :
76 : : As the order of evaluations of the body B changes this is valid
77 : : only in certain situations: all distance vectors need to be forward.
78 : : Additionally if there are multiple induction variables than just
79 : : a counting control IV (j above) we can also deal with some situations.
80 : :
81 : : The validity is checked by unroll_jam_possible_p, and the data-dep
82 : : testing below.
83 : :
84 : : A trivial example where the fusion is wrong would be when
85 : : B(i,j) == x[j-1] = x[j];
86 : : for (i by 2) {
87 : : for (j) {
88 : : x[j-1] = x[j];
89 : : }
90 : : for (j) {
91 : : x[j-1] = x[j];
92 : : }
93 : : } effect: move content to front by two elements
94 : : -->
95 : : for (i by 2) {
96 : : for (j) {
97 : : x[j-1] = x[j];
98 : : x[j-1] = x[j];
99 : : }
100 : : } effect: move content to front by one element
101 : : */
102 : :
103 : : /* Modify the loop tree for the fact that all code once belonging
104 : : to the OLD loop or the outer loop of OLD now is inside LOOP. */
105 : :
106 : : static void
107 : 55 : merge_loop_tree (class loop *loop, class loop *old)
108 : : {
109 : 55 : basic_block *bbs;
110 : 55 : int i, n;
111 : 55 : class loop *subloop;
112 : 55 : edge e;
113 : 55 : edge_iterator ei;
114 : :
115 : : /* Find its nodes. */
116 : 55 : bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
117 : 55 : n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
118 : :
119 : 446 : for (i = 0; i < n; i++)
120 : : {
121 : : /* If the block was direct child of OLD loop it's now part
122 : : of LOOP. If it was outside OLD, then it moved into LOOP
123 : : as well. This avoids changing the loop father for BBs
124 : : in inner loops of OLD. */
125 : 719 : if (bbs[i]->loop_father == old
126 : 664 : || loop_depth (bbs[i]->loop_father) < loop_depth (old))
127 : : {
128 : 328 : remove_bb_from_loops (bbs[i]);
129 : 328 : add_bb_to_loop (bbs[i], loop);
130 : 328 : continue;
131 : : }
132 : :
133 : : /* If we find a direct subloop of OLD, move it to LOOP. */
134 : 63 : subloop = bbs[i]->loop_father;
135 : 63 : if (loop_outer (subloop) == old && subloop->header == bbs[i])
136 : : {
137 : 0 : flow_loop_tree_node_remove (subloop);
138 : 0 : flow_loop_tree_node_add (loop, subloop);
139 : : }
140 : : }
141 : :
142 : : /* Update the information about loop exit edges. */
143 : 446 : for (i = 0; i < n; i++)
144 : : {
145 : 845 : FOR_EACH_EDGE (e, ei, bbs[i]->succs)
146 : : {
147 : 454 : rescan_loop_exit (e, false, false);
148 : : }
149 : : }
150 : :
151 : 55 : loop->num_nodes = n;
152 : :
153 : 55 : free (bbs);
154 : 55 : }
155 : :
156 : : /* BB is part of the outer loop of an unroll-and-jam situation.
157 : : Check if any statements therein would prevent the transformation. */
158 : :
159 : : static bool
160 : 6882 : bb_prevents_fusion_p (basic_block bb)
161 : : {
162 : 6882 : gimple_stmt_iterator gsi;
163 : : /* BB is duplicated by outer unrolling and then all N-1 first copies
164 : : move into the body of the fused inner loop. If BB exits the outer loop
165 : : the last copy still does so, and the first N-1 copies are cancelled
166 : : by loop unrolling, so also after fusion it's the exit block.
167 : : But there might be other reasons that prevent fusion:
168 : : * stores or unknown side-effects prevent fusion
169 : : * loads don't
170 : : * computations into SSA names: these aren't problematic. Their
171 : : result will be unused on the exit edges of the first N-1 copies
172 : : (those aren't taken after unrolling). If they are used on the
173 : : other edge (the one leading to the outer latch block) they are
174 : : loop-carried (on the outer loop) and the Nth copy of BB will
175 : : compute them again (i.e. the first N-1 copies will be dead). */
176 : 27252 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
177 : : {
178 : 13760 : gimple *g = gsi_stmt (gsi);
179 : 23326 : if (gimple_vdef (g) || gimple_has_side_effects (g))
180 : 272 : return true;
181 : : }
182 : : return false;
183 : : }
184 : :
185 : : /* Given an inner loop LOOP (of some OUTER loop) determine if
186 : : we can safely fuse copies of it (generated by outer unrolling).
187 : : If so return true, otherwise return false. */
188 : :
189 : : static bool
190 : 14325 : unroll_jam_possible_p (class loop *outer, class loop *loop)
191 : : {
192 : 14325 : basic_block *bbs;
193 : 14325 : int i, n;
194 : 14325 : class tree_niter_desc niter;
195 : :
196 : : /* When fusing the loops we skip the latch block
197 : : of the first one, so it mustn't have any effects to
198 : : preserve. */
199 : 14325 : if (!empty_block_p (loop->latch))
200 : : return false;
201 : :
202 : 13726 : edge exit;
203 : 13726 : if (!(exit = single_exit (loop)))
204 : : return false;
205 : :
206 : : /* We need a perfect nest. Quick check for adjacent inner loops. */
207 : 9895 : if (outer->inner != loop || loop->next)
208 : : return false;
209 : :
210 : : /* Prevent head-controlled inner loops, that we usually have.
211 : : The guard block would need to be accepted
212 : : (invariant condition either entering or skipping the loop),
213 : : without also accepting arbitrary control flow. When unswitching
214 : : ran before us (as with -O3) this won't be a problem because its
215 : : outer loop unswitching will have moved out the invariant condition.
216 : :
217 : : If we do that we need to extend fuse_loops() to cope with this
218 : : by threading through the (still invariant) copied condition
219 : : between the two loop copies. */
220 : 5843 : if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
221 : : return false;
222 : :
223 : : /* The number of iterations of the inner loop must be loop invariant
224 : : with respect to the outer loop. */
225 : 3632 : if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
226 : : false, true)
227 : 3258 : || niter.cmp == ERROR_MARK
228 : 3258 : || !integer_zerop (niter.may_be_zero)
229 : 6663 : || !expr_invariant_in_loop_p (outer, niter.niter))
230 : 924 : return false;
231 : :
232 : : /* If the inner loop produces any values that are used inside the
233 : : outer loop (except the virtual op) then it can flow
234 : : back (perhaps indirectly) into the inner loop. This prevents
235 : : fusion: without fusion the value at the last iteration is used,
236 : : with fusion the value after the initial iteration is used.
237 : :
238 : : If all uses are outside the outer loop this doesn't prevent fusion;
239 : : the value of the last iteration is still used (and the values from
240 : : all intermediate iterations are dead). */
241 : 2708 : gphi_iterator psi;
242 : 2708 : for (psi = gsi_start_phis (single_exit (loop)->dest);
243 : 5141 : !gsi_end_p (psi); gsi_next (&psi))
244 : : {
245 : 2807 : imm_use_iterator imm_iter;
246 : 2807 : use_operand_p use_p;
247 : 2807 : tree op = gimple_phi_result (psi.phi ());
248 : 5614 : if (virtual_operand_p (op))
249 : 2341 : continue;
250 : 566 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
251 : : {
252 : 474 : gimple *use_stmt = USE_STMT (use_p);
253 : 474 : if (!is_gimple_debug (use_stmt)
254 : 474 : && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
255 : 374 : return false;
256 : : }
257 : : }
258 : :
259 : : /* And check blocks belonging to just outer loop. */
260 : 2334 : bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
261 : 2334 : n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
262 : :
263 : 13629 : for (i = 0; i < n; i++)
264 : 11602 : if (bbs[i]->loop_father == outer
265 : 11602 : && (bb_prevents_fusion_p (bbs[i])
266 : : /* Outer loop exits must come after the inner loop, otherwise
267 : : we'll put the outer loop exit into the fused inner loop. */
268 : 6610 : || (loop_exits_from_bb_p (outer, bbs[i])
269 : 2064 : && !dominated_by_p (CDI_DOMINATORS, bbs[i], exit->src))))
270 : : break;
271 : 2334 : free (bbs);
272 : 2334 : if (i != n)
273 : : return false;
274 : :
275 : : /* For now we can safely fuse copies of LOOP only if all
276 : : loop carried variables are inductions (or the virtual op).
277 : :
278 : : We could handle reductions as well (the initial value in the second
279 : : body would be the after-iter value of the first body) if it's over
280 : : an associative and commutative operation. We wouldn't
281 : : be able to handle unknown cycles. */
282 : 2027 : bool inner_vdef = false;
283 : 5358 : for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
284 : : {
285 : 3720 : affine_iv iv;
286 : 3720 : tree op = gimple_phi_result (psi.phi ());
287 : :
288 : 7440 : if (virtual_operand_p (op))
289 : : {
290 : 1621 : inner_vdef = true;
291 : 1621 : continue;
292 : : }
293 : 2099 : if (!simple_iv (loop, loop, op, &iv, true))
294 : 389 : return false;
295 : : /* The inductions must be regular, loop invariant step and initial
296 : : value. */
297 : 2078 : if (!expr_invariant_in_loop_p (outer, iv.step)
298 : 2078 : || !expr_invariant_in_loop_p (outer, iv.base))
299 : 368 : return false;
300 : : /* XXX With more effort we could also be able to deal with inductions
301 : : where the initial value is loop variant but a simple IV in the
302 : : outer loop. The initial value for the second body would be
303 : : the original initial value plus iv.base.step. The next value
304 : : for the fused loop would be the original next value of the first
305 : : copy, _not_ the next value of the second body. */
306 : : }
307 : :
308 : : /* When there's no inner loop virtual PHI IV we cannot handle the update
309 : : required to the inner loop if that doesn't already have one. See
310 : : PR117113. */
311 : 1638 : if (!inner_vdef && get_virtual_phi (outer->header))
312 : : return false;
313 : :
314 : : return true;
315 : 14325 : }
316 : :
317 : : /* Fuse LOOP with all further neighbors. The loops are expected to
318 : : be in appropriate form. */
319 : :
320 : : static void
321 : 55 : fuse_loops (class loop *loop)
322 : : {
323 : 55 : class loop *next = loop->next;
324 : :
325 : 110 : while (next)
326 : : {
327 : 55 : edge e;
328 : :
329 : 55 : remove_branch (single_pred_edge (loop->latch));
330 : : /* Make delete_basic_block not fiddle with the loop structure. */
331 : 55 : basic_block oldlatch = loop->latch;
332 : 55 : loop->latch = NULL;
333 : 55 : delete_basic_block (oldlatch);
334 : 55 : e = redirect_edge_and_branch (loop_latch_edge (next),
335 : : loop->header);
336 : 55 : loop->latch = e->src;
337 : 55 : flush_pending_stmts (e);
338 : :
339 : 55 : gcc_assert (EDGE_COUNT (next->header->preds) == 1);
340 : :
341 : : /* The PHI nodes of the second body (single-argument now)
342 : : need adjustments to use the right values: either directly
343 : : the value of the corresponding PHI in the first copy or
344 : : the one leaving the first body which unrolling did for us.
345 : :
346 : : See also unroll_jam_possible_p() for further possibilities. */
347 : 55 : gphi_iterator psi_first, psi_second;
348 : 55 : e = single_pred_edge (next->header);
349 : 55 : for (psi_first = gsi_start_phis (loop->header),
350 : 55 : psi_second = gsi_start_phis (next->header);
351 : 166 : !gsi_end_p (psi_first);
352 : 111 : gsi_next (&psi_first), gsi_next (&psi_second))
353 : : {
354 : 111 : gphi *phi_first = psi_first.phi ();
355 : 111 : gphi *phi_second = psi_second.phi ();
356 : 111 : tree firstop = gimple_phi_result (phi_first);
357 : : /* The virtual operand is correct already as it's
358 : : always live at exit, hence has a LCSSA node and outer
359 : : loop unrolling updated SSA form. */
360 : 222 : if (virtual_operand_p (firstop))
361 : 55 : continue;
362 : :
363 : : /* Due to unroll_jam_possible_p() we know that this is
364 : : an induction. The second body goes over the same
365 : : iteration space. */
366 : 56 : add_phi_arg (phi_second, firstop, e,
367 : : gimple_location (phi_first));
368 : : }
369 : 55 : gcc_assert (gsi_end_p (psi_second));
370 : :
371 : 55 : merge_loop_tree (loop, next);
372 : 55 : gcc_assert (!next->num_nodes);
373 : 55 : class loop *ln = next->next;
374 : 55 : delete_loop (next);
375 : 55 : next = ln;
376 : : }
377 : 55 : }
378 : :
379 : : /* Return true if any of the access functions for dataref A
380 : : isn't invariant with respect to loop LOOP_NEST. */
381 : : static bool
382 : 3636 : any_access_function_variant_p (const struct data_reference *a,
383 : : const class loop *loop_nest)
384 : : {
385 : 3636 : vec<tree> fns = DR_ACCESS_FNS (a);
386 : :
387 : 11447 : for (tree t : fns)
388 : 4161 : if (!evolution_function_is_invariant_p (t, loop_nest->num))
389 : : return true;
390 : :
391 : : return false;
392 : : }
393 : :
394 : : /* Returns true if the distance in DDR can be determined and adjusts
395 : : the unroll factor in *UNROLL to make unrolling valid for that distance.
396 : : Otherwise return false. DDR is with respect to the outer loop of INNER.
397 : :
398 : : If this data dep can lead to a removed memory reference, increment
399 : : *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
400 : : for this to happen. */
401 : :
402 : : static bool
403 : 3629 : adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
404 : : unsigned *unroll, unsigned *profit_unroll,
405 : : unsigned *removed)
406 : : {
407 : 3629 : bool ret = false;
408 : 3629 : if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
409 : : {
410 : 3629 : if (DDR_NUM_DIST_VECTS (ddr) == 0)
411 : 3629 : return false;
412 : : unsigned i;
413 : : lambda_vector dist_v;
414 : 3633 : FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
415 : : {
416 : : /* A distance (a,b) is at worst transformed into (a/N,b) by the
417 : : unrolling (factor N), so the transformation is valid if
418 : : a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll
419 : : factor needs to be limited so that the first condition holds.
420 : : That may limit the factor down to zero in the worst case. */
421 : 2381 : lambda_int dist = dist_v[0];
422 : 2381 : if (dist < 0)
423 : 0 : gcc_unreachable ();
424 : 2381 : else if (dist >= (lambda_int)*unroll)
425 : : ;
426 : 6981 : else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
427 : : {
428 : : /* We have (a,0) with a < N, so this will be transformed into
429 : : (0,0) after unrolling by N. This might potentially be a
430 : : problem, if it's not a read-read dependency. */
431 : 2265 : if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
432 : : ;
433 : : else
434 : : {
435 : : /* So, at least one is a write, and we might reduce the
436 : : distance vector to (0,0). This is still no problem
437 : : if both data-refs are affine with respect to the inner
438 : : loops. But if one of them is invariant with respect
439 : : to an inner loop our reordering implicit in loop fusion
440 : : corrupts the program, as our data dependences don't
441 : : capture this. E.g. for:
442 : : for (0 <= i < n)
443 : : for (0 <= j < m)
444 : : a[i][0] = a[i+1][0] + 2; // (1)
445 : : b[i][j] = b[i+1][j] + 2; // (2)
446 : : the distance vector for both statements is (-1,0),
447 : : but exchanging the order for (2) is okay, while
448 : : for (1) it is not. To see this, write out the original
449 : : accesses (assume m is 2):
450 : : a i j original
451 : : 0 0 0 r a[1][0] b[1][0]
452 : : 1 0 0 w a[0][0] b[0][0]
453 : : 2 0 1 r a[1][0] b[1][1]
454 : : 3 0 1 w a[0][0] b[0][1]
455 : : 4 1 0 r a[2][0] b[2][0]
456 : : 5 1 0 w a[1][0] b[1][0]
457 : : after unroll-by-2 and fusion the accesses are done in
458 : : this order (from column a): 0,1, 4,5, 2,3, i.e. this:
459 : : a i j transformed
460 : : 0 0 0 r a[1][0] b[1][0]
461 : : 1 0 0 w a[0][0] b[0][0]
462 : : 4 1 0 r a[2][0] b[2][0]
463 : : 5 1 0 w a[1][0] b[1][0]
464 : : 2 0 1 r a[1][0] b[1][1]
465 : : 3 0 1 w a[0][0] b[0][1]
466 : : Note how access 2 accesses the same element as access 5
467 : : for array 'a' but not for array 'b'. */
468 : 1825 : if (any_access_function_variant_p (DDR_A (ddr), inner)
469 : 1825 : && any_access_function_variant_p (DDR_B (ddr), inner))
470 : : ;
471 : : else
472 : : /* And if any dataref of this pair is invariant with
473 : : respect to the inner loop, we have no chance than
474 : : to reduce the unroll factor. */
475 : 14 : *unroll = dist;
476 : : }
477 : : }
478 : 2425 : else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
479 : : ;
480 : : else
481 : 44 : *unroll = dist;
482 : :
483 : : /* With a distance (a,0) it's always profitable to unroll-and-jam
484 : : (by a+1), because one memory reference will go away. With
485 : : (a,b) and b != 0 that's less clear. We will increase the
486 : : number of streams without lowering the number of mem refs.
487 : : So for now only handle the first situation. */
488 : 7143 : if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
489 : : {
490 : 2269 : *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
491 : 2269 : (*removed)++;
492 : : }
493 : :
494 : 2381 : ret = true;
495 : : }
496 : : }
497 : : return ret;
498 : : }
499 : :
500 : : /* Main entry point for the unroll-and-jam transformation
501 : : described above. */
502 : :
503 : : static unsigned int
504 : 25682 : tree_loop_unroll_and_jam (void)
505 : : {
506 : 25682 : unsigned int todo = 0;
507 : :
508 : 25682 : gcc_assert (scev_initialized_p ());
509 : :
510 : : /* Go through all innermost loops. */
511 : 139667 : for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
512 : : {
513 : 62621 : class loop *outer = loop_outer (loop);
514 : :
515 : 62621 : if (loop_depth (loop) < 2
516 : 62621 : || optimize_loop_nest_for_size_p (outer))
517 : 61410 : continue;
518 : :
519 : 14325 : if (!unroll_jam_possible_p (outer, loop))
520 : 12717 : continue;
521 : :
522 : 1608 : vec<data_reference_p> datarefs = vNULL;
523 : 1608 : vec<ddr_p> dependences = vNULL;
524 : 1608 : unsigned unroll_factor, profit_unroll, removed;
525 : 1608 : class tree_niter_desc desc;
526 : 1608 : bool unroll = false;
527 : :
528 : 1608 : auto_vec<loop_p, 3> loop_nest;
529 : 1608 : if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
530 : : &datarefs, &dependences))
531 : : {
532 : 318 : if (dump_file && (dump_flags & TDF_DETAILS))
533 : 0 : fprintf (dump_file, "Cannot analyze data dependencies\n");
534 : 318 : free_data_refs (datarefs);
535 : 318 : free_dependence_relations (dependences);
536 : 318 : continue;
537 : : }
538 : 1290 : if (!datarefs.length ())
539 : 79 : continue;
540 : :
541 : 1211 : if (dump_file && (dump_flags & TDF_DETAILS))
542 : 14 : dump_data_dependence_relations (dump_file, dependences);
543 : :
544 : 1211 : unroll_factor = (unsigned)-1;
545 : 1211 : profit_unroll = 1;
546 : 1211 : removed = 0;
547 : :
548 : : /* Check all dependencies. */
549 : 1211 : unsigned i;
550 : 1211 : struct data_dependence_relation *ddr;
551 : 40586 : FOR_EACH_VEC_ELT (dependences, i, ddr)
552 : : {
553 : 39722 : struct data_reference *dra, *drb;
554 : :
555 : : /* If the refs are independend there's nothing to do. */
556 : 39722 : if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
557 : 32252 : continue;
558 : :
559 : 7470 : dra = DDR_A (ddr);
560 : 7470 : drb = DDR_B (ddr);
561 : :
562 : : /* Nothing interesting for the self dependencies, except for WAW if
563 : : the access function is not affine or constant because we may end
564 : : up reordering writes to the same location. */
565 : 7470 : if (dra == drb)
566 : : {
567 : 7534 : if (DR_IS_WRITE (dra)
568 : 2055 : && !DR_ACCESS_FNS (dra).is_empty ()
569 : 5885 : && DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
570 : : {
571 : 148 : unroll_factor = 0;
572 : 148 : break;
573 : : }
574 : : else
575 : 3693 : continue;
576 : : }
577 : :
578 : : /* Now check the distance vector, for determining a sensible
579 : : outer unroll factor, and for validity of merging the inner
580 : : loop copies. */
581 : 3629 : if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
582 : : &removed))
583 : : {
584 : : /* Couldn't get the distance vector. For two reads that's
585 : : harmless (we assume we should unroll). For at least
586 : : one write this means we can't check the dependence direction
587 : : and hence can't determine safety. */
588 : :
589 : 2377 : if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
590 : : {
591 : 199 : unroll_factor = 0;
592 : 199 : break;
593 : : }
594 : : }
595 : : }
596 : :
597 : : /* We regard a user-specified minimum percentage of zero as a request
598 : : to ignore all profitability concerns and apply the transformation
599 : : always. */
600 : 1211 : if (!param_unroll_jam_min_percent)
601 : 24 : profit_unroll = MAX(2, profit_unroll);
602 : 1187 : else if (removed * 100 / datarefs.length ()
603 : 1187 : < (unsigned)param_unroll_jam_min_percent)
604 : 978 : profit_unroll = 1;
605 : 1211 : if (unroll_factor > profit_unroll)
606 : 822 : unroll_factor = profit_unroll;
607 : 1211 : if (unroll_factor > (unsigned)param_unroll_jam_max_unroll)
608 : 0 : unroll_factor = param_unroll_jam_max_unroll;
609 : 2477 : unroll = (unroll_factor > 1
610 : 1211 : && can_unroll_loop_p (outer, unroll_factor, &desc));
611 : :
612 : 55 : if (unroll)
613 : : {
614 : 55 : if (dump_enabled_p ())
615 : 8 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
616 : 8 : find_loop_location (outer),
617 : : "applying unroll and jam with factor %d\n",
618 : : unroll_factor);
619 : 55 : initialize_original_copy_tables ();
620 : 55 : tree_unroll_loop (outer, unroll_factor, &desc);
621 : 55 : free_original_copy_tables ();
622 : 55 : fuse_loops (outer->inner);
623 : 55 : todo |= TODO_cleanup_cfg;
624 : :
625 : 55 : auto_bitmap exit_bbs;
626 : 55 : bitmap_set_bit (exit_bbs, single_exit (outer)->dest->index);
627 : 55 : todo |= do_rpo_vn (cfun, loop_preheader_edge (outer), exit_bbs);
628 : 55 : }
629 : :
630 : 1211 : loop_nest.release ();
631 : 1211 : free_dependence_relations (dependences);
632 : 1211 : free_data_refs (datarefs);
633 : 1608 : }
634 : :
635 : 25682 : if (todo)
636 : : {
637 : 47 : free_dominance_info (CDI_DOMINATORS);
638 : : /* We need to cleanup the CFG first since otherwise SSA form can
639 : : be not up-to-date from the VN run. */
640 : 47 : if (todo & TODO_cleanup_cfg)
641 : : {
642 : 47 : cleanup_tree_cfg ();
643 : 47 : todo &= ~TODO_cleanup_cfg;
644 : : }
645 : 47 : rewrite_into_loop_closed_ssa (NULL, 0);
646 : 47 : scev_reset ();
647 : : }
648 : 25682 : return todo;
649 : : }
650 : :
651 : : /* Pass boilerplate */
652 : :
653 : : namespace {
654 : :
655 : : const pass_data pass_data_loop_jam =
656 : : {
657 : : GIMPLE_PASS, /* type */
658 : : "unrolljam", /* name */
659 : : OPTGROUP_LOOP, /* optinfo_flags */
660 : : TV_LOOP_JAM, /* tv_id */
661 : : PROP_cfg, /* properties_required */
662 : : 0, /* properties_provided */
663 : : 0, /* properties_destroyed */
664 : : 0, /* todo_flags_start */
665 : : 0, /* todo_flags_finish */
666 : : };
667 : :
668 : : class pass_loop_jam : public gimple_opt_pass
669 : : {
670 : : public:
671 : 282866 : pass_loop_jam (gcc::context *ctxt)
672 : 565732 : : gimple_opt_pass (pass_data_loop_jam, ctxt)
673 : : {}
674 : :
675 : : /* opt_pass methods: */
676 : 227899 : bool gate (function *) final override { return flag_unroll_jam != 0; }
677 : : unsigned int execute (function *) final override;
678 : :
679 : : };
680 : :
681 : : unsigned int
682 : 25682 : pass_loop_jam::execute (function *fun)
683 : : {
684 : 51364 : if (number_of_loops (fun) <= 1)
685 : : return 0;
686 : :
687 : 25682 : return tree_loop_unroll_and_jam ();
688 : : }
689 : :
690 : : }
691 : :
692 : : gimple_opt_pass *
693 : 282866 : make_pass_loop_jam (gcc::context *ctxt)
694 : : {
695 : 282866 : return new pass_loop_jam (ctxt);
696 : : }
|