Branch data Line data Source code
1 : : /* Loop unroll-and-jam.
2 : : Copyright (C) 2017-2024 Free Software Foundation, Inc.
3 : :
4 : : This file is part of GCC.
5 : :
6 : : GCC is free software; you can redistribute it and/or modify it
7 : : under the terms of the GNU General Public License as published by the
8 : : Free Software Foundation; either version 3, or (at your option) any
9 : : later version.
10 : :
11 : : GCC is distributed in the hope that it will be useful, but WITHOUT
12 : : ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 : : for more details.
15 : :
16 : : You should have received a copy of the GNU General Public License
17 : : along with GCC; see the file COPYING3. If not see
18 : : <http://www.gnu.org/licenses/>. */
19 : :
20 : : #include "config.h"
21 : : #include "system.h"
22 : : #include "coretypes.h"
23 : : #include "tree-pass.h"
24 : : #include "backend.h"
25 : : #include "tree.h"
26 : : #include "gimple.h"
27 : : #include "ssa.h"
28 : : #include "fold-const.h"
29 : : #include "tree-cfg.h"
30 : : #include "tree-ssa.h"
31 : : #include "tree-ssa-loop-niter.h"
32 : : #include "tree-ssa-loop.h"
33 : : #include "tree-ssa-loop-manip.h"
34 : : #include "cfgloop.h"
35 : : #include "tree-scalar-evolution.h"
36 : : #include "gimple-iterator.h"
37 : : #include "cfghooks.h"
38 : : #include "tree-data-ref.h"
39 : : #include "tree-ssa-loop-ivopts.h"
40 : : #include "tree-vectorizer.h"
41 : : #include "tree-ssa-sccvn.h"
42 : : #include "tree-cfgcleanup.h"
43 : :
44 : : /* Unroll and Jam transformation
45 : :
46 : : This is a combination of two transformations, where the second
47 : : is not always valid. It's applicable if a loop nest has redundancies
48 : : over the iterations of an outer loop while not having that with
49 : : an inner loop.
50 : :
51 : : Given this nest:
52 : : for (i) {
53 : : for (j) {
54 : : B(i,j)
55 : : }
56 : : }
57 : :
58 : : first unroll:
59 : : for (i by 2) {
60 : : for (j) {
61 : : B(i,j)
62 : : }
63 : : for (j) {
64 : : B(i+1,j)
65 : : }
66 : : }
67 : :
68 : : then fuse the two adjacent inner loops resulting from that:
69 : : for (i by 2) {
70 : : for (j) {
71 : : B(i,j)
72 : : B(i+1,j)
73 : : }
74 : : }
75 : :
76 : : As the order of evaluations of the body B changes this is valid
77 : : only in certain situations: all distance vectors need to be forward.
78 : : Additionally if there are multiple induction variables than just
79 : : a counting control IV (j above) we can also deal with some situations.
80 : :
81 : : The validity is checked by unroll_jam_possible_p, and the data-dep
82 : : testing below.
83 : :
84 : : A trivial example where the fusion is wrong would be when
85 : : B(i,j) == x[j-1] = x[j];
86 : : for (i by 2) {
87 : : for (j) {
88 : : x[j-1] = x[j];
89 : : }
90 : : for (j) {
91 : : x[j-1] = x[j];
92 : : }
93 : : } effect: move content to front by two elements
94 : : -->
95 : : for (i by 2) {
96 : : for (j) {
97 : : x[j-1] = x[j];
98 : : x[j-1] = x[j];
99 : : }
100 : : } effect: move content to front by one element
101 : : */
102 : :
103 : : /* Modify the loop tree for the fact that all code once belonging
104 : : to the OLD loop or the outer loop of OLD now is inside LOOP. */
105 : :
106 : : static void
107 : 71 : merge_loop_tree (class loop *loop, class loop *old)
108 : : {
109 : 71 : basic_block *bbs;
110 : 71 : int i, n;
111 : 71 : class loop *subloop;
112 : 71 : edge e;
113 : 71 : edge_iterator ei;
114 : :
115 : : /* Find its nodes. */
116 : 71 : bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
117 : 71 : n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
118 : :
119 : 624 : for (i = 0; i < n; i++)
120 : : {
121 : : /* If the block was direct child of OLD loop it's now part
122 : : of LOOP. If it was outside OLD, then it moved into LOOP
123 : : as well. This avoids changing the loop father for BBs
124 : : in inner loops of OLD. */
125 : 1027 : if (bbs[i]->loop_father == old
126 : 956 : || loop_depth (bbs[i]->loop_father) < loop_depth (old))
127 : : {
128 : 474 : remove_bb_from_loops (bbs[i]);
129 : 474 : add_bb_to_loop (bbs[i], loop);
130 : 474 : continue;
131 : : }
132 : :
133 : : /* If we find a direct subloop of OLD, move it to LOOP. */
134 : 79 : subloop = bbs[i]->loop_father;
135 : 79 : if (loop_outer (subloop) == old && subloop->header == bbs[i])
136 : : {
137 : 0 : flow_loop_tree_node_remove (subloop);
138 : 0 : flow_loop_tree_node_add (loop, subloop);
139 : : }
140 : : }
141 : :
142 : : /* Update the information about loop exit edges. */
143 : 624 : for (i = 0; i < n; i++)
144 : : {
145 : 1201 : FOR_EACH_EDGE (e, ei, bbs[i]->succs)
146 : : {
147 : 648 : rescan_loop_exit (e, false, false);
148 : : }
149 : : }
150 : :
151 : 71 : loop->num_nodes = n;
152 : :
153 : 71 : free (bbs);
154 : 71 : }
155 : :
156 : : /* BB is part of the outer loop of an unroll-and-jam situation.
157 : : Check if any statements therein would prevent the transformation. */
158 : :
159 : : static bool
160 : 6750 : bb_prevents_fusion_p (basic_block bb)
161 : : {
162 : 6750 : gimple_stmt_iterator gsi;
163 : : /* BB is duplicated by outer unrolling and then all N-1 first copies
164 : : move into the body of the fused inner loop. If BB exits the outer loop
165 : : the last copy still does so, and the first N-1 copies are cancelled
166 : : by loop unrolling, so also after fusion it's the exit block.
167 : : But there might be other reasons that prevent fusion:
168 : : * stores or unknown side-effects prevent fusion
169 : : * loads don't
170 : : * computations into SSA names: these aren't problematic. Their
171 : : result will be unused on the exit edges of the first N-1 copies
172 : : (those aren't taken after unrolling). If they are used on the
173 : : other edge (the one leading to the outer latch block) they are
174 : : loop-carried (on the outer loop) and the Nth copy of BB will
175 : : compute them again (i.e. the first N-1 copies will be dead). */
176 : 26724 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
177 : : {
178 : 13492 : gimple *g = gsi_stmt (gsi);
179 : 22802 : if (gimple_vdef (g) || gimple_has_side_effects (g))
180 : 268 : return true;
181 : : }
182 : : return false;
183 : : }
184 : :
185 : : /* Given an inner loop LOOP (of some OUTER loop) determine if
186 : : we can safely fuse copies of it (generated by outer unrolling).
187 : : If so return true, otherwise return false. */
188 : :
189 : : static bool
190 : 14189 : unroll_jam_possible_p (class loop *outer, class loop *loop)
191 : : {
192 : 14189 : basic_block *bbs;
193 : 14189 : int i, n;
194 : 14189 : class tree_niter_desc niter;
195 : :
196 : : /* When fusing the loops we skip the latch block
197 : : of the first one, so it mustn't have any effects to
198 : : preserve. */
199 : 14189 : if (!empty_block_p (loop->latch))
200 : : return false;
201 : :
202 : 13614 : edge exit;
203 : 13614 : if (!(exit = single_exit (loop)))
204 : : return false;
205 : :
206 : : /* We need a perfect nest. Quick check for adjacent inner loops. */
207 : 9787 : if (outer->inner != loop || loop->next)
208 : : return false;
209 : :
210 : : /* Prevent head-controlled inner loops, that we usually have.
211 : : The guard block would need to be accepted
212 : : (invariant condition either entering or skipping the loop),
213 : : without also accepting arbitrary control flow. When unswitching
214 : : ran before us (as with -O3) this won't be a problem because its
215 : : outer loop unswitching will have moved out the invariant condition.
216 : :
217 : : If we do that we need to extend fuse_loops() to cope with this
218 : : by threading through the (still invariant) copied condition
219 : : between the two loop copies. */
220 : 5762 : if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
221 : : return false;
222 : :
223 : : /* The number of iterations of the inner loop must be loop invariant
224 : : with respect to the outer loop. */
225 : 3582 : if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
226 : : false, true)
227 : 3203 : || niter.cmp == ERROR_MARK
228 : 3203 : || !integer_zerop (niter.may_be_zero)
229 : 6564 : || !expr_invariant_in_loop_p (outer, niter.niter))
230 : 923 : return false;
231 : :
232 : : /* If the inner loop produces any values that are used inside the
233 : : outer loop (except the virtual op) then it can flow
234 : : back (perhaps indirectly) into the inner loop. This prevents
235 : : fusion: without fusion the value at the last iteration is used,
236 : : with fusion the value after the initial iteration is used.
237 : :
238 : : If all uses are outside the outer loop this doesn't prevent fusion;
239 : : the value of the last iteration is still used (and the values from
240 : : all intermediate iterations are dead). */
241 : 2659 : gphi_iterator psi;
242 : 2659 : for (psi = gsi_start_phis (single_exit (loop)->dest);
243 : 5044 : !gsi_end_p (psi); gsi_next (&psi))
244 : : {
245 : 2754 : imm_use_iterator imm_iter;
246 : 2754 : use_operand_p use_p;
247 : 2754 : tree op = gimple_phi_result (psi.phi ());
248 : 5508 : if (virtual_operand_p (op))
249 : 2295 : continue;
250 : 557 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
251 : : {
252 : 467 : gimple *use_stmt = USE_STMT (use_p);
253 : 467 : if (!is_gimple_debug (use_stmt)
254 : 467 : && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
255 : 369 : return false;
256 : : }
257 : : }
258 : :
259 : : /* And check blocks belonging to just outer loop. */
260 : 2290 : bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
261 : 2290 : n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
262 : :
263 : 13379 : for (i = 0; i < n; i++)
264 : 11392 : if (bbs[i]->loop_father == outer
265 : 11392 : && (bb_prevents_fusion_p (bbs[i])
266 : : /* Outer loop exits must come after the inner loop, otherwise
267 : : we'll put the outer loop exit into the fused inner loop. */
268 : 6482 : || (loop_exits_from_bb_p (outer, bbs[i])
269 : 2024 : && !dominated_by_p (CDI_DOMINATORS, bbs[i], exit->src))))
270 : : break;
271 : 2290 : free (bbs);
272 : 2290 : if (i != n)
273 : : return false;
274 : :
275 : : /* For now we can safely fuse copies of LOOP only if all
276 : : loop carried variables are inductions (or the virtual op).
277 : :
278 : : We could handle reductions as well (the initial value in the second
279 : : body would be the after-iter value of the first body) if it's over
280 : : an associative and commutative operation. We wouldn't
281 : : be able to handle unknown cycles. */
282 : 5240 : for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
283 : : {
284 : 3642 : affine_iv iv;
285 : 3642 : tree op = gimple_phi_result (psi.phi ());
286 : :
287 : 7284 : if (virtual_operand_p (op))
288 : 1583 : continue;
289 : 2059 : if (!simple_iv (loop, loop, op, &iv, true))
290 : 389 : return false;
291 : : /* The inductions must be regular, loop invariant step and initial
292 : : value. */
293 : 2038 : if (!expr_invariant_in_loop_p (outer, iv.step)
294 : 2038 : || !expr_invariant_in_loop_p (outer, iv.base))
295 : 368 : return false;
296 : : /* XXX With more effort we could also be able to deal with inductions
297 : : where the initial value is loop variant but a simple IV in the
298 : : outer loop. The initial value for the second body would be
299 : : the original initial value plus iv.base.step. The next value
300 : : for the fused loop would be the original next value of the first
301 : : copy, _not_ the next value of the second body. */
302 : : }
303 : :
304 : : return true;
305 : 14189 : }
306 : :
307 : : /* Fuse LOOP with all further neighbors. The loops are expected to
308 : : be in appropriate form. */
309 : :
310 : : static void
311 : 71 : fuse_loops (class loop *loop)
312 : : {
313 : 71 : class loop *next = loop->next;
314 : :
315 : 142 : while (next)
316 : : {
317 : 71 : edge e;
318 : :
319 : 71 : remove_branch (single_pred_edge (loop->latch));
320 : : /* Make delete_basic_block not fiddle with the loop structure. */
321 : 71 : basic_block oldlatch = loop->latch;
322 : 71 : loop->latch = NULL;
323 : 71 : delete_basic_block (oldlatch);
324 : 71 : e = redirect_edge_and_branch (loop_latch_edge (next),
325 : : loop->header);
326 : 71 : loop->latch = e->src;
327 : 71 : flush_pending_stmts (e);
328 : :
329 : 71 : gcc_assert (EDGE_COUNT (next->header->preds) == 1);
330 : :
331 : : /* The PHI nodes of the second body (single-argument now)
332 : : need adjustments to use the right values: either directly
333 : : the value of the corresponding PHI in the first copy or
334 : : the one leaving the first body which unrolling did for us.
335 : :
336 : : See also unroll_jam_possible_p() for further possibilities. */
337 : 71 : gphi_iterator psi_first, psi_second;
338 : 71 : e = single_pred_edge (next->header);
339 : 71 : for (psi_first = gsi_start_phis (loop->header),
340 : 71 : psi_second = gsi_start_phis (next->header);
341 : 198 : !gsi_end_p (psi_first);
342 : 127 : gsi_next (&psi_first), gsi_next (&psi_second))
343 : : {
344 : 127 : gphi *phi_first = psi_first.phi ();
345 : 127 : gphi *phi_second = psi_second.phi ();
346 : 127 : tree firstop = gimple_phi_result (phi_first);
347 : : /* The virtual operand is correct already as it's
348 : : always live at exit, hence has a LCSSA node and outer
349 : : loop unrolling updated SSA form. */
350 : 254 : if (virtual_operand_p (firstop))
351 : 55 : continue;
352 : :
353 : : /* Due to unroll_jam_possible_p() we know that this is
354 : : an induction. The second body goes over the same
355 : : iteration space. */
356 : 72 : add_phi_arg (phi_second, firstop, e,
357 : : gimple_location (phi_first));
358 : : }
359 : 71 : gcc_assert (gsi_end_p (psi_second));
360 : :
361 : 71 : merge_loop_tree (loop, next);
362 : 71 : gcc_assert (!next->num_nodes);
363 : 71 : class loop *ln = next->next;
364 : 71 : delete_loop (next);
365 : 71 : next = ln;
366 : : }
367 : 71 : }
368 : :
369 : : /* Return true if any of the access functions for dataref A
370 : : isn't invariant with respect to loop LOOP_NEST. */
371 : : static bool
372 : 3650 : any_access_function_variant_p (const struct data_reference *a,
373 : : const class loop *loop_nest)
374 : : {
375 : 3650 : vec<tree> fns = DR_ACCESS_FNS (a);
376 : :
377 : 11487 : for (tree t : fns)
378 : 4175 : if (!evolution_function_is_invariant_p (t, loop_nest->num))
379 : : return true;
380 : :
381 : : return false;
382 : : }
383 : :
384 : : /* Returns true if the distance in DDR can be determined and adjusts
385 : : the unroll factor in *UNROLL to make unrolling valid for that distance.
386 : : Otherwise return false. DDR is with respect to the outer loop of INNER.
387 : :
388 : : If this data dep can lead to a removed memory reference, increment
389 : : *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
390 : : for this to happen. */
391 : :
392 : : static bool
393 : 3627 : adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
394 : : unsigned *unroll, unsigned *profit_unroll,
395 : : unsigned *removed)
396 : : {
397 : 3627 : bool ret = false;
398 : 3627 : if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
399 : : {
400 : 3627 : if (DDR_NUM_DIST_VECTS (ddr) == 0)
401 : 3627 : return false;
402 : : unsigned i;
403 : : lambda_vector dist_v;
404 : 3705 : FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
405 : : {
406 : : /* A distance (a,b) is at worst transformed into (a/N,b) by the
407 : : unrolling (factor N), so the transformation is valid if
408 : : a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll
409 : : factor needs to be limited so that the first condition holds.
410 : : That may limit the factor down to zero in the worst case. */
411 : 2435 : lambda_int dist = dist_v[0];
412 : 2435 : if (dist < 0)
413 : 0 : gcc_unreachable ();
414 : 2435 : else if (dist >= (lambda_int)*unroll)
415 : : ;
416 : 7143 : else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
417 : : {
418 : : /* We have (a,0) with a < N, so this will be transformed into
419 : : (0,0) after unrolling by N. This might potentially be a
420 : : problem, if it's not a read-read dependency. */
421 : 2303 : if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
422 : : ;
423 : : else
424 : : {
425 : : /* So, at least one is a write, and we might reduce the
426 : : distance vector to (0,0). This is still no problem
427 : : if both data-refs are affine with respect to the inner
428 : : loops. But if one of them is invariant with respect
429 : : to an inner loop our reordering implicit in loop fusion
430 : : corrupts the program, as our data dependences don't
431 : : capture this. E.g. for:
432 : : for (0 <= i < n)
433 : : for (0 <= j < m)
434 : : a[i][0] = a[i+1][0] + 2; // (1)
435 : : b[i][j] = b[i+1][j] + 2; // (2)
436 : : the distance vector for both statements is (-1,0),
437 : : but exchanging the order for (2) is okay, while
438 : : for (1) it is not. To see this, write out the original
439 : : accesses (assume m is 2):
440 : : a i j original
441 : : 0 0 0 r a[1][0] b[1][0]
442 : : 1 0 0 w a[0][0] b[0][0]
443 : : 2 0 1 r a[1][0] b[1][1]
444 : : 3 0 1 w a[0][0] b[0][1]
445 : : 4 1 0 r a[2][0] b[2][0]
446 : : 5 1 0 w a[1][0] b[1][0]
447 : : after unroll-by-2 and fusion the accesses are done in
448 : : this order (from column a): 0,1, 4,5, 2,3, i.e. this:
449 : : a i j transformed
450 : : 0 0 0 r a[1][0] b[1][0]
451 : : 1 0 0 w a[0][0] b[0][0]
452 : : 4 1 0 r a[2][0] b[2][0]
453 : : 5 1 0 w a[1][0] b[1][0]
454 : : 2 0 1 r a[1][0] b[1][1]
455 : : 3 0 1 w a[0][0] b[0][1]
456 : : Note how access 2 accesses the same element as access 5
457 : : for array 'a' but not for array 'b'. */
458 : 1831 : if (any_access_function_variant_p (DDR_A (ddr), inner)
459 : 1831 : && any_access_function_variant_p (DDR_B (ddr), inner))
460 : : ;
461 : : else
462 : : /* And if any dataref of this pair is invariant with
463 : : respect to the inner loop, we have no chance than
464 : : to reduce the unroll factor. */
465 : 12 : *unroll = dist;
466 : : }
467 : : }
468 : 2479 : else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
469 : : ;
470 : : else
471 : 44 : *unroll = dist;
472 : :
473 : : /* With a distance (a,0) it's always profitable to unroll-and-jam
474 : : (by a+1), because one memory reference will go away. With
475 : : (a,b) and b != 0 that's less clear. We will increase the
476 : : number of streams without lowering the number of mem refs.
477 : : So for now only handle the first situation. */
478 : 7305 : if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
479 : : {
480 : 2307 : *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
481 : 2307 : (*removed)++;
482 : : }
483 : :
484 : 2435 : ret = true;
485 : : }
486 : : }
487 : : return ret;
488 : : }
489 : :
490 : : /* Main entry point for the unroll-and-jam transformation
491 : : described above. */
492 : :
493 : : static unsigned int
494 : 25297 : tree_loop_unroll_and_jam (void)
495 : : {
496 : 25297 : unsigned int todo = 0;
497 : :
498 : 25297 : gcc_assert (scev_initialized_p ());
499 : :
500 : : /* Go through all innermost loops. */
501 : 137738 : for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
502 : : {
503 : 61847 : class loop *outer = loop_outer (loop);
504 : :
505 : 61847 : if (loop_depth (loop) < 2
506 : 61847 : || optimize_loop_nest_for_size_p (outer))
507 : 60662 : continue;
508 : :
509 : 14189 : if (!unroll_jam_possible_p (outer, loop))
510 : 12591 : continue;
511 : :
512 : 1598 : vec<data_reference_p> datarefs = vNULL;
513 : 1598 : vec<ddr_p> dependences = vNULL;
514 : 1598 : unsigned unroll_factor, profit_unroll, removed;
515 : 1598 : class tree_niter_desc desc;
516 : 1598 : bool unroll = false;
517 : :
518 : 1598 : auto_vec<loop_p, 3> loop_nest;
519 : 1598 : if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
520 : : &datarefs, &dependences))
521 : : {
522 : 322 : if (dump_file && (dump_flags & TDF_DETAILS))
523 : 0 : fprintf (dump_file, "Cannot analyze data dependencies\n");
524 : 322 : free_data_refs (datarefs);
525 : 322 : free_dependence_relations (dependences);
526 : 322 : continue;
527 : : }
528 : 1276 : if (!datarefs.length ())
529 : 91 : continue;
530 : :
531 : 1185 : if (dump_file && (dump_flags & TDF_DETAILS))
532 : 14 : dump_data_dependence_relations (dump_file, dependences);
533 : :
534 : 1185 : unroll_factor = (unsigned)-1;
535 : 1185 : profit_unroll = 1;
536 : 1185 : removed = 0;
537 : :
538 : : /* Check all dependencies. */
539 : 1185 : unsigned i;
540 : 1185 : struct data_dependence_relation *ddr;
541 : 40542 : FOR_EACH_VEC_ELT (dependences, i, ddr)
542 : : {
543 : 39686 : struct data_reference *dra, *drb;
544 : :
545 : : /* If the refs are independend there's nothing to do. */
546 : 39686 : if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
547 : 32226 : continue;
548 : :
549 : 7460 : dra = DDR_A (ddr);
550 : 7460 : drb = DDR_B (ddr);
551 : :
552 : : /* Nothing interesting for the self dependencies, except for WAW if
553 : : the access function is not affine or constant because we may end
554 : : up reordering writes to the same location. */
555 : 7460 : if (dra == drb)
556 : : {
557 : 7518 : if (DR_IS_WRITE (dra)
558 : 2037 : && !DR_ACCESS_FNS (dra).is_empty ()
559 : 5865 : && DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
560 : : {
561 : 148 : unroll_factor = 0;
562 : 148 : break;
563 : : }
564 : : else
565 : 3685 : continue;
566 : : }
567 : :
568 : : /* Now check the distance vector, for determining a sensible
569 : : outer unroll factor, and for validity of merging the inner
570 : : loop copies. */
571 : 3627 : if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
572 : : &removed))
573 : : {
574 : : /* Couldn't get the distance vector. For two reads that's
575 : : harmless (we assume we should unroll). For at least
576 : : one write this means we can't check the dependence direction
577 : : and hence can't determine safety. */
578 : :
579 : 2357 : if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
580 : : {
581 : 181 : unroll_factor = 0;
582 : 181 : break;
583 : : }
584 : : }
585 : : }
586 : :
587 : : /* We regard a user-specified minimum percentage of zero as a request
588 : : to ignore all profitability concerns and apply the transformation
589 : : always. */
590 : 1185 : if (!param_unroll_jam_min_percent)
591 : 24 : profit_unroll = MAX(2, profit_unroll);
592 : 1161 : else if (removed * 100 / datarefs.length ()
593 : 1161 : < (unsigned)param_unroll_jam_min_percent)
594 : 938 : profit_unroll = 1;
595 : 1185 : if (unroll_factor > profit_unroll)
596 : 814 : unroll_factor = profit_unroll;
597 : 1185 : if (unroll_factor > (unsigned)param_unroll_jam_max_unroll)
598 : 0 : unroll_factor = param_unroll_jam_max_unroll;
599 : 2441 : unroll = (unroll_factor > 1
600 : 1185 : && can_unroll_loop_p (outer, unroll_factor, &desc));
601 : :
602 : 71 : if (unroll)
603 : : {
604 : 71 : if (dump_enabled_p ())
605 : 8 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
606 : 8 : find_loop_location (outer),
607 : : "applying unroll and jam with factor %d\n",
608 : : unroll_factor);
609 : 71 : initialize_original_copy_tables ();
610 : 71 : tree_unroll_loop (outer, unroll_factor, &desc);
611 : 71 : free_original_copy_tables ();
612 : 71 : fuse_loops (outer->inner);
613 : 71 : todo |= TODO_cleanup_cfg;
614 : :
615 : 71 : auto_bitmap exit_bbs;
616 : 71 : bitmap_set_bit (exit_bbs, single_exit (outer)->dest->index);
617 : 71 : todo |= do_rpo_vn (cfun, loop_preheader_edge (outer), exit_bbs);
618 : 71 : }
619 : :
620 : 1185 : loop_nest.release ();
621 : 1185 : free_dependence_relations (dependences);
622 : 1185 : free_data_refs (datarefs);
623 : 1598 : }
624 : :
625 : 25297 : if (todo)
626 : : {
627 : 63 : free_dominance_info (CDI_DOMINATORS);
628 : : /* We need to cleanup the CFG first since otherwise SSA form can
629 : : be not up-to-date from the VN run. */
630 : 63 : if (todo & TODO_cleanup_cfg)
631 : : {
632 : 63 : cleanup_tree_cfg ();
633 : 63 : todo &= ~TODO_cleanup_cfg;
634 : : }
635 : 63 : rewrite_into_loop_closed_ssa (NULL, 0);
636 : 63 : scev_reset ();
637 : : }
638 : 25297 : return todo;
639 : : }
640 : :
641 : : /* Pass boilerplate */
642 : :
643 : : namespace {
644 : :
645 : : const pass_data pass_data_loop_jam =
646 : : {
647 : : GIMPLE_PASS, /* type */
648 : : "unrolljam", /* name */
649 : : OPTGROUP_LOOP, /* optinfo_flags */
650 : : TV_LOOP_JAM, /* tv_id */
651 : : PROP_cfg, /* properties_required */
652 : : 0, /* properties_provided */
653 : : 0, /* properties_destroyed */
654 : : 0, /* todo_flags_start */
655 : : 0, /* todo_flags_finish */
656 : : };
657 : :
658 : : class pass_loop_jam : public gimple_opt_pass
659 : : {
660 : : public:
661 : 280114 : pass_loop_jam (gcc::context *ctxt)
662 : 560228 : : gimple_opt_pass (pass_data_loop_jam, ctxt)
663 : : {}
664 : :
665 : : /* opt_pass methods: */
666 : 225983 : bool gate (function *) final override { return flag_unroll_jam != 0; }
667 : : unsigned int execute (function *) final override;
668 : :
669 : : };
670 : :
671 : : unsigned int
672 : 25297 : pass_loop_jam::execute (function *fun)
673 : : {
674 : 50594 : if (number_of_loops (fun) <= 1)
675 : : return 0;
676 : :
677 : 25297 : return tree_loop_unroll_and_jam ();
678 : : }
679 : :
680 : : }
681 : :
682 : : gimple_opt_pass *
683 : 280114 : make_pass_loop_jam (gcc::context *ctxt)
684 : : {
685 : 280114 : return new pass_loop_jam (ctxt);
686 : : }
|