Line data Source code
1 : /* Loop interchange.
2 : Copyright (C) 2017-2026 Free Software Foundation, Inc.
3 : Contributed by ARM Ltd.
4 :
5 : This file is part of GCC.
6 :
7 : GCC is free software; you can redistribute it and/or modify it
8 : under the terms of the GNU General Public License as published by the
9 : Free Software Foundation; either version 3, or (at your option) any
10 : later version.
11 :
12 : GCC is distributed in the hope that it will be useful, but WITHOUT
13 : ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 : for more details.
16 :
17 : You should have received a copy of the GNU General Public License
18 : along with GCC; see the file COPYING3. If not see
19 : <http://www.gnu.org/licenses/>. */
20 :
21 : #include "config.h"
22 : #include "system.h"
23 : #include "coretypes.h"
24 : #include "backend.h"
25 : #include "is-a.h"
26 : #include "tree.h"
27 : #include "gimple.h"
28 : #include "tree-pass.h"
29 : #include "ssa.h"
30 : #include "gimple-pretty-print.h"
31 : #include "fold-const.h"
32 : #include "gimplify.h"
33 : #include "gimple-iterator.h"
34 : #include "gimplify-me.h"
35 : #include "cfgloop.h"
36 : #include "tree-ssa.h"
37 : #include "tree-scalar-evolution.h"
38 : #include "tree-ssa-loop-manip.h"
39 : #include "tree-ssa-loop-niter.h"
40 : #include "tree-ssa-loop-ivopts.h"
41 : #include "tree-ssa-dce.h"
42 : #include "tree-data-ref.h"
43 : #include "tree-vectorizer.h"
44 :
45 : /* This pass performs loop interchange: for example, the loop nest
46 :
47 : for (int j = 0; j < N; j++)
48 : for (int k = 0; k < N; k++)
49 : for (int i = 0; i < N; i++)
50 : c[i][j] = c[i][j] + a[i][k]*b[k][j];
51 :
52 : is transformed to
53 :
54 : for (int i = 0; i < N; i++)
55 : for (int j = 0; j < N; j++)
56 : for (int k = 0; k < N; k++)
57 : c[i][j] = c[i][j] + a[i][k]*b[k][j];
58 :
59 : This pass implements loop interchange in the following steps:
60 :
61 : 1) Find perfect loop nest for each innermost loop and compute data
62 : dependence relations for it. For above example, loop nest is
63 : <loop_j, loop_k, loop_i>.
64 : 2) From innermost to outermost loop, this pass tries to interchange
65 : each loop pair. For above case, it firstly tries to interchange
66 : <loop_k, loop_i> and loop nest becomes <loop_j, loop_i, loop_k>.
67 : Then it tries to interchange <loop_j, loop_i> and loop nest becomes
68 : <loop_i, loop_j, loop_k>. The overall effect is to move innermost
69 : loop to the outermost position. For loop pair <loop_i, loop_j>
70 : to be interchanged, we:
71 : 3) Check if data dependence relations are valid for loop interchange.
72 : 4) Check if both loops can be interchanged in terms of transformation.
73 : 5) Check if interchanging the two loops is profitable.
74 : 6) Interchange the two loops by mapping induction variables.
75 :
76 : This pass also handles reductions in loop nest. So far we only support
77 : simple reduction of inner loop and double reduction of the loop nest. */
78 :
79 : /* Maximum number of stmts in each loop that should be interchanged. */
80 : #define MAX_NUM_STMT (param_loop_interchange_max_num_stmts)
81 : /* Maximum number of data references in loop nest. */
82 : #define MAX_DATAREFS (param_loop_max_datarefs_for_datadeps)
83 :
84 : /* Comparison ratio of access stride between inner/outer loops to be
85 : interchanged. This is the minimum stride ratio for loop interchange
86 : to be profitable. */
87 : #define OUTER_STRIDE_RATIO (param_loop_interchange_stride_ratio)
88 : /* The same as above, but we require higher ratio for interchanging the
89 : innermost two loops. */
90 : #define INNER_STRIDE_RATIO ((OUTER_STRIDE_RATIO) + 1)
91 :
92 : /* Comparison ratio of stmt cost between inner/outer loops. Loops won't
93 : be interchanged if outer loop has too many stmts. */
94 : #define STMT_COST_RATIO (3)
95 :
96 : /* Vector of strides that DR accesses in each level loop of a loop nest. */
97 : #define DR_ACCESS_STRIDE(dr) ((vec<tree> *) dr->aux)
98 :
99 : /* Structure recording loop induction variable. */
100 : typedef struct induction
101 : {
102 : /* IV itself. */
103 : tree var;
104 : /* IV's initializing value, which is the init arg of the IV PHI node. */
105 : tree init_val;
106 : /* IV's initializing expr, which is (the expanded result of) init_val. */
107 : tree init_expr;
108 : /* IV's step. */
109 : tree step;
110 : } *induction_p;
111 :
112 : /* Enum type for loop reduction variable. */
113 : enum reduction_type
114 : {
115 : UNKNOWN_RTYPE = 0,
116 : SIMPLE_RTYPE,
117 : DOUBLE_RTYPE
118 : };
119 :
120 : /* Structure recording loop reduction variable. */
121 : typedef struct reduction
122 : {
123 : /* Reduction itself. */
124 : tree var;
125 : /* PHI node defining reduction variable. */
126 : gphi *phi;
127 : /* Init and next variables of the reduction. */
128 : tree init;
129 : tree next;
130 : /* Lcssa PHI node if reduction is used outside of its definition loop. */
131 : gphi *lcssa_phi;
132 : /* Stmts defining init and next. */
133 : gimple *producer;
134 : gimple *consumer;
135 : /* If init is loaded from memory, this is the loading memory reference. */
136 : tree init_ref;
137 : /* If reduction is finally stored to memory, this is the stored memory
138 : reference. */
139 : tree fini_ref;
140 : enum reduction_type type;
141 : } *reduction_p;
142 :
143 :
144 : /* Dump reduction RE. */
145 :
146 : static void
147 10 : dump_reduction (reduction_p re)
148 : {
149 10 : if (re->type == SIMPLE_RTYPE)
150 6 : fprintf (dump_file, " Simple reduction: ");
151 4 : else if (re->type == DOUBLE_RTYPE)
152 2 : fprintf (dump_file, " Double reduction: ");
153 : else
154 2 : fprintf (dump_file, " Unknown reduction: ");
155 :
156 10 : print_gimple_stmt (dump_file, re->phi, 0);
157 10 : }
158 :
159 : /* Dump LOOP's induction IV. */
160 : static void
161 51 : dump_induction (class loop *loop, induction_p iv)
162 : {
163 51 : fprintf (dump_file, " Induction: ");
164 51 : print_generic_expr (dump_file, iv->var, TDF_SLIM);
165 51 : fprintf (dump_file, " = {");
166 51 : print_generic_expr (dump_file, iv->init_expr, TDF_SLIM);
167 51 : fprintf (dump_file, ", ");
168 51 : print_generic_expr (dump_file, iv->step, TDF_SLIM);
169 51 : fprintf (dump_file, "}_%d\n", loop->num);
170 51 : }
171 :
172 : /* Loop candidate for interchange. */
173 :
class loop_cand
{
public:
  loop_cand (class loop *, class loop *);
  ~loop_cand ();

  /* Return the reduction for which STMT is the lcssa PHI, producer or
     consumer stmt, or NULL if there is none.  */
  reduction_p find_reduction_by_stmt (gimple *);
  /* Detect the store-motion shape of a simple reduction (load before the
     loop, store after it) and record what is needed to undo it.  */
  void classify_simple_reduction (reduction_p);
  /* Analyze a loop-carried var as reduction of the inner loop.  */
  bool analyze_iloop_reduction_var (tree);
  /* Analyze a loop-carried var as double reduction of the outer loop.  */
  bool analyze_oloop_reduction_var (loop_cand *, tree);
  /* Analyze a loop-carried var as induction variable given its scev.  */
  bool analyze_induction_var (tree, tree);
  /* Analyze all loop-carried vars defined in this loop's header.  */
  bool analyze_carried_vars (loop_cand *);
  /* Verify every lcssa PHI of this loop belongs to a known reduction.  */
  bool analyze_lcssa_phis (void);
  /* Check this candidate contains nothing preventing interchange.  */
  bool can_interchange_p (loop_cand *);
  /* Undo store motion of a simple reduction to make interchange valid.  */
  void undo_simple_reduction (reduction_p, bitmap);

  /* The loop itself.  */
  class loop *m_loop;
  /* The outer loop for interchange.  It equals m_loop if this loop cand
     itself represents the outer loop.  */
  class loop *m_outer;
  /* Vector of induction variables in loop.  */
  vec<induction_p> m_inductions;
  /* Vector of reduction variables in loop.  */
  vec<reduction_p> m_reductions;
  /* Lcssa PHI nodes of this loop.  */
  vec<gphi *> m_lcssa_nodes;
  /* Single exit edge of this loop.  */
  edge m_exit;
  /* Basic blocks of this loop.  */
  basic_block *m_bbs;
  /* Number of stmts of this loop.  Inner loops' stmts are not included.  */
  int m_num_stmts;
  /* Number of constant initialized simple reduction.  */
  int m_const_init_reduc;
};
210 :
211 : /* Constructor. */
212 :
/* LOOP is the candidate loop; OUTER is the outer loop of the interchange
   pair (equal to LOOP when this candidate itself is the outer loop).  */

loop_cand::loop_cand (class loop *loop, class loop *outer)
  : m_loop (loop), m_outer (outer), m_exit (single_exit (loop)),
    m_bbs (get_loop_body (loop)), m_num_stmts (0), m_const_init_reduc (0)
{
  /* Small initial capacity; loops typically carry only a few vars.  */
  m_inductions.create (3);
  m_reductions.create (3);
  m_lcssa_nodes.create (3);
}
221 :
222 : /* Destructor. */
223 :
224 176 : loop_cand::~loop_cand ()
225 : {
226 176 : induction_p iv;
227 482 : for (unsigned i = 0; m_inductions.iterate (i, &iv); ++i)
228 306 : free (iv);
229 :
230 : reduction_p re;
231 211 : for (unsigned i = 0; m_reductions.iterate (i, &re); ++i)
232 35 : free (re);
233 :
234 176 : m_inductions.release ();
235 176 : m_reductions.release ();
236 176 : m_lcssa_nodes.release ();
237 176 : free (m_bbs);
238 176 : }
239 :
240 : /* Return single use stmt of VAR in LOOP, otherwise return NULL. */
241 :
242 : static gimple *
243 13 : single_use_in_loop (tree var, class loop *loop)
244 : {
245 13 : gimple *stmt, *res = NULL;
246 13 : use_operand_p use_p;
247 13 : imm_use_iterator iterator;
248 :
249 39 : FOR_EACH_IMM_USE_FAST (use_p, iterator, var)
250 : {
251 13 : stmt = USE_STMT (use_p);
252 13 : if (is_gimple_debug (stmt))
253 0 : continue;
254 :
255 13 : if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
256 0 : continue;
257 :
258 13 : if (res)
259 0 : return NULL;
260 :
261 : res = stmt;
262 0 : }
263 13 : return res;
264 : }
265 :
266 : /* Return true if E is unsupported in loop interchange, i.e, E is a complex
267 : edge or part of irreducible loop. */
268 :
269 : static inline bool
270 126013 : unsupported_edge (edge e)
271 : {
272 126013 : return (e->flags & (EDGE_COMPLEX | EDGE_IRREDUCIBLE_LOOP));
273 : }
274 :
275 : /* Return the reduction if STMT is one of its lcssa PHI, producer or consumer
276 : stmt. */
277 :
278 : reduction_p
279 53 : loop_cand::find_reduction_by_stmt (gimple *stmt)
280 : {
281 53 : gphi *phi = dyn_cast <gphi *> (stmt);
282 53 : reduction_p re;
283 :
284 53 : for (unsigned i = 0; m_reductions.iterate (i, &re); ++i)
285 43 : if ((phi != NULL && phi == re->lcssa_phi)
286 10 : || (stmt == re->producer || stmt == re->consumer))
287 : return re;
288 :
289 : return NULL;
290 : }
291 :
292 : /* Return true if current loop_cand be interchanged. ILOOP is not NULL if
293 : current loop_cand is outer loop in loop nest. */
294 :
bool
loop_cand::can_interchange_p (loop_cand *iloop)
{
  /* For now we only support at most one reduction.  */
  unsigned allowed_reduction_num = 1;

  /* Only support reduction if the loop nest to be interchanged is the
     innermost two loops; otherwise no reduction is allowed at all.  */
  if ((iloop == NULL && m_loop->inner != NULL)
      || (iloop != NULL && iloop->m_loop->inner != NULL))
    allowed_reduction_num = 0;

  /* Reject too many reductions, or a single reduction we failed to
     classify (UNKNOWN_RTYPE).  */
  if (m_reductions.length () > allowed_reduction_num
      || (m_reductions.length () == 1
	  && m_reductions[0]->type == UNKNOWN_RTYPE))
    return false;

  /* Only support lcssa PHI node which is for reduction.  */
  if (m_lcssa_nodes.length () > allowed_reduction_num)
    return false;

  /* Check if basic block has any unsupported operation.  Note basic blocks
     of inner loops are not checked here.  */
  for (unsigned i = 0; i < m_loop->num_nodes; i++)
    {
      basic_block bb = m_bbs[i];
      gphi_iterator psi;
      gimple_stmt_iterator gsi;

      /* Skip basic blocks of inner loops.  */
      if (bb->loop_father != m_loop)
	continue;

      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;

	  /* Side effects can't be reordered across loop boundaries.  */
	  if (gimple_has_side_effects (stmt))
	    return false;

	  m_num_stmts++;
	  if (gcall *call = dyn_cast <gcall *> (stmt))
	    {
	      /* In basic block of outer loop, the call should be cheap since
		 it will be moved to inner loop.  */
	      if (iloop != NULL
		  && !gimple_inexpensive_call_p (call))
		return false;
	      continue;
	    }

	  /* Non-memory stmts, and any stmt when checking the inner loop
	     itself (ILOOP == NULL), are fine.  */
	  if (!iloop || !gimple_vuse (stmt))
	    continue;

	  /* Support stmt accessing memory in outer loop only if it is for
	     inner loop's reduction.  */
	  if (iloop->find_reduction_by_stmt (stmt))
	    continue;

	  tree lhs;
	  /* Support loop invariant memory reference if it's only used once by
	     inner loop.  */
	  /* ??? How's this checking for invariantness?  */
	  if (gimple_assign_single_p (stmt)
	      && (lhs = gimple_assign_lhs (stmt)) != NULL_TREE
	      && TREE_CODE (lhs) == SSA_NAME
	      && single_use_in_loop (lhs, iloop->m_loop))
	    continue;

	  return false;
	}
      /* Check if loop has too many stmts.  */
      if (m_num_stmts > MAX_NUM_STMT)
	return false;

      /* Allow PHI nodes in any basic block of inner loop, PHI nodes in outer
	 loop's header, or PHI nodes in dest bb of inner loop's exit edge.  */
      if (!iloop || bb == m_loop->header
	  || bb == iloop->m_exit->dest)
	continue;

      /* Don't allow any other (non-virtual) PHI nodes.  */
      for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
	if (!virtual_operand_p (PHI_RESULT (psi.phi ())))
	  return false;
    }

  return true;
}
386 :
387 : /* Programmers and optimizers (like loop store motion) may optimize code:
388 :
389 : for (int i = 0; i < N; i++)
390 : for (int j = 0; j < N; j++)
391 : a[i] += b[j][i] * c[j][i];
392 :
393 : into reduction:
394 :
395 : for (int i = 0; i < N; i++)
396 : {
397 : // producer.  Note sum can be initialized to a constant.
398 : int sum = a[i];
399 : for (int j = 0; j < N; j++)
400 : {
401 : sum += b[j][i] * c[j][i];
402 : }
403 : // consumer.
404 : a[i] = sum;
405 : }
406 :
407 : The result code can't be interchanged without undoing the optimization.
408 : This function classifies this kind of reduction and records information so
409 : that we can undo the store motion during interchange. */
410 :
void
loop_cand::classify_simple_reduction (reduction_p re)
{
  gimple *producer, *consumer;

  /* Check init variable of reduction and how it is initialized.  */
  if (TREE_CODE (re->init) == SSA_NAME)
    {
      /* Init defined by an SSA name: it must be a load from memory in the
	 outer loop for us to be able to undo the store motion.  */
      producer = SSA_NAME_DEF_STMT (re->init);
      re->producer = producer;
      basic_block bb = gimple_bb (producer);
      if (!bb || bb->loop_father != m_outer)
	return;

      if (!gimple_assign_load_p (producer))
	return;

      re->init_ref = gimple_assign_rhs1 (producer);
    }
  else if (CONSTANT_CLASS_P (re->init))
    /* Constant-initialized reduction; count it so the profitability code
       can account for the extra load we must synthesize.  */
    m_const_init_reduc++;
  else
    return;

  /* Check how reduction variable is used: its lcssa result must feed a
     single store in the outer loop.  */
  consumer = single_use_in_loop (PHI_RESULT (re->lcssa_phi), m_outer);
  if (!consumer
      || !gimple_store_p (consumer))
    return;

  re->fini_ref = gimple_get_lhs (consumer);
  re->consumer = consumer;

  /* Simple reduction with constant initializer: use the stored memory
     reference as the load location too.  */
  if (!re->init_ref)
    {
      gcc_assert (CONSTANT_CLASS_P (re->init));
      re->init_ref = unshare_expr (re->fini_ref);
    }

  /* Require memory references in producer and consumer are the same so
     that we can undo reduction during interchange.  */
  if (re->init_ref && !operand_equal_p (re->init_ref, re->fini_ref, 0))
    return;

  re->type = SIMPLE_RTYPE;
}
458 :
459 : /* Analyze reduction variable VAR for inner loop of the loop nest to be
460 : interchanged. Return true if analysis succeeds. */
461 :
bool
loop_cand::analyze_iloop_reduction_var (tree var)
{
  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
  gphi *lcssa_phi = NULL, *use_phi;
  tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop));
  reduction_p re;
  gimple *stmt, *next_def, *single_use = NULL;
  use_operand_p use_p;
  imm_use_iterator iterator;

  if (TREE_CODE (next) != SSA_NAME)
    return false;

  /* The updated value must be defined inside the loop.  */
  next_def = SSA_NAME_DEF_STMT (next);
  basic_block bb = gimple_bb (next_def);
  if (!bb || !flow_bb_inside_loop_p (m_loop, bb))
    return false;

  /* In restricted reduction, the var is (and must be) used in defining
     the updated var.  The process can be depicted as below:

       var ;; = PHI<init, next>
       |
       |
       v
      +---------------------+
      | reduction operators | <-- other operands
      +---------------------+
       |
       |
       v
      next

     In terms of loop interchange, we don't change how NEXT is computed based
     on VAR and OTHER OPERANDS.  In case of double reduction in loop nest
     to be interchanged, we don't change it at all.  In the case of simple
     reduction in inner loop, we only make change how VAR/NEXT is loaded or
     stored.  With these conditions, we can relax restrictions on reduction
     in a way that reduction operation is seen as black box.  In general,
     we can ignore reassociation of reduction operator; we can handle fake
     reductions in which VAR is not even used to compute NEXT.  */
  if (! single_imm_use (var, &use_p, &single_use)
      || ! flow_bb_inside_loop_p (m_loop, gimple_bb (single_use)))
    return false;

  /* Check the reduction operation.  We require a left-associative operation.
     For FP math we also need to be allowed to associate operations.  */
  if (gassign *ass = dyn_cast <gassign *> (single_use))
    {
      enum tree_code code = gimple_assign_rhs_code (ass);
      if (! (associative_tree_code (code)
	     || (code == MINUS_EXPR
		 && use_p->use == gimple_assign_rhs1_ptr (ass)))
	  || (FLOAT_TYPE_P (TREE_TYPE (var))
	      && ! flag_associative_math))
	return false;
    }
  else
    return false;

  /* Handle and verify a series of stmts feeding the reduction op.  */
  if (single_use != next_def
      && !check_reduction_path (dump_user_location_t (), m_loop, phi, next,
				gimple_assign_rhs_code (single_use)))
    return false;

  /* Only support cases in which INIT is used in inner loop.  */
  if (TREE_CODE (init) == SSA_NAME)
    FOR_EACH_IMM_USE_FAST (use_p, iterator, init)
      {
	stmt = USE_STMT (use_p);
	if (is_gimple_debug (stmt))
	  continue;

	if (!flow_bb_inside_loop_p (m_loop, gimple_bb (stmt)))
	  return false;
      }

  /* NEXT may only feed the PHI itself and a single lcssa PHI at the
     loop exit.  */
  FOR_EACH_IMM_USE_FAST (use_p, iterator, next)
    {
      stmt = USE_STMT (use_p);
      if (is_gimple_debug (stmt))
	continue;

      /* Or else it's used in PHI itself.  */
      use_phi = dyn_cast <gphi *> (stmt);
      if (use_phi == phi)
	continue;

      if (use_phi != NULL
	  && lcssa_phi == NULL
	  && gimple_bb (stmt) == m_exit->dest
	  && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next)
	lcssa_phi = use_phi;
      else
	return false;
    }
  if (!lcssa_phi)
    return false;

  /* Record the reduction; classification may refine its type to
     SIMPLE_RTYPE if it matches the store-motion shape.  */
  re = XCNEW (struct reduction);
  re->var = var;
  re->init = init;
  re->next = next;
  re->phi = phi;
  re->lcssa_phi = lcssa_phi;

  classify_simple_reduction (re);

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_reduction (re);

  m_reductions.safe_push (re);
  return true;
}
579 :
580 : /* Analyze reduction variable VAR for outer loop of the loop nest to be
581 : interchanged. ILOOP is not NULL and points to inner loop. For the
582 : moment, we only support double reduction for outer loop, like:
583 :
584 : for (int i = 0; i < n; i++)
585 : {
586 : int sum = 0;
587 :
588 : for (int j = 0; j < n; j++) // outer loop
589 : for (int k = 0; k < n; k++) // inner loop
590 : sum += a[i][k]*b[k][j];
591 :
592 : s[i] = sum;
593 : }
594 :
595 : Note the innermost two loops are the loop nest to be interchanged.
596 : Return true if analysis succeeds. */
597 :
bool
loop_cand::analyze_oloop_reduction_var (loop_cand *iloop, tree var)
{
  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
  gphi *lcssa_phi = NULL, *use_phi;
  tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));
  tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop));
  reduction_p re;
  gimple *stmt, *next_def;
  use_operand_p use_p;
  imm_use_iterator iterator;

  if (TREE_CODE (next) != SSA_NAME)
    return false;

  /* The updated value must be defined inside the outer loop.  */
  next_def = SSA_NAME_DEF_STMT (next);
  basic_block bb = gimple_bb (next_def);
  if (!bb || !flow_bb_inside_loop_p (m_loop, bb))
    return false;

  /* Find inner loop's simple reduction that uses var as initializer.  */
  reduction_p inner_re = NULL;
  for (unsigned i = 0; iloop->m_reductions.iterate (i, &inner_re); ++i)
    if (inner_re->init == var || operand_equal_p (inner_re->init, var, 0))
      break;

  /* The inner reduction must be unclassified (not SIMPLE/DOUBLE yet) and
     its init must be produced by this very PHI.  */
  if (inner_re == NULL
      || inner_re->type != UNKNOWN_RTYPE
      || inner_re->producer != phi)
    return false;

  /* In case of double reduction, outer loop's reduction should be updated
     by inner loop's simple reduction.  */
  if (next_def != inner_re->lcssa_phi)
    return false;

  /* Outer loop's reduction should only be used to initialize inner loop's
     simple reduction.  */
  if (! single_imm_use (var, &use_p, &stmt)
      || stmt != inner_re->phi)
    return false;

  /* Check this reduction is correctly used outside of loop via lcssa phi.  */
  FOR_EACH_IMM_USE_FAST (use_p, iterator, next)
    {
      stmt = USE_STMT (use_p);
      if (is_gimple_debug (stmt))
	continue;

      /* Or else it's used in PHI itself.  */
      use_phi = dyn_cast <gphi *> (stmt);
      if (use_phi == phi)
	continue;

      if (lcssa_phi == NULL
	  && use_phi != NULL
	  && gimple_bb (stmt) == m_exit->dest
	  && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next)
	lcssa_phi = use_phi;
      else
	return false;
    }
  if (!lcssa_phi)
    return false;

  /* Record the double reduction and mark the inner reduction as its
     other half.  */
  re = XCNEW (struct reduction);
  re->var = var;
  re->init = init;
  re->next = next;
  re->phi = phi;
  re->lcssa_phi = lcssa_phi;
  re->type = DOUBLE_RTYPE;
  inner_re->type = DOUBLE_RTYPE;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_reduction (re);

  m_reductions.safe_push (re);
  return true;
}
678 :
679 : /* Return true if VAR is induction variable of current loop whose scev is
680 : specified by CHREC. */
681 :
bool
loop_cand::analyze_induction_var (tree var, tree chrec)
{
  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var));
  tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop));

  /* Var is loop invariant, though it's unlikely to happen.  Treat it as
     an induction with step zero.  */
  if (tree_does_not_contain_chrecs (chrec))
    {
      /* Punt on floating point invariants if honoring signed zeros,
	 representing that as + 0.0 would change the result if init
	 is -0.0.  Similarly for SNaNs it can raise exception.  */
      if (HONOR_SIGNED_ZEROS (chrec) || HONOR_SNANS (chrec))
	return false;
      struct induction *iv = XCNEW (struct induction);
      iv->var = var;
      iv->init_val = init;
      iv->init_expr = chrec;
      iv->step = build_zero_cst (TREE_TYPE (chrec));
      m_inductions.safe_push (iv);
      return true;
    }

  /* Require a simple affine IV of this very loop: {left, +, right}_loop
     with invariant (chrec-free) left and right parts.  */
  if (TREE_CODE (chrec) != POLYNOMIAL_CHREC
      || CHREC_VARIABLE (chrec) != (unsigned) m_loop->num
      || tree_contains_chrecs (CHREC_LEFT (chrec), NULL)
      || tree_contains_chrecs (CHREC_RIGHT (chrec), NULL))
    return false;

  struct induction *iv = XCNEW (struct induction);
  iv->var = var;
  iv->init_val = init;
  iv->init_expr = CHREC_LEFT (chrec);
  iv->step = CHREC_RIGHT (chrec);

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_induction (m_loop, iv);

  m_inductions.safe_push (iv);
  return true;
}
723 :
724 : /* Return true if all loop carried variables defined in loop header can
725 : be successfully analyzed. */
726 :
bool
loop_cand::analyze_carried_vars (loop_cand *iloop)
{
  edge e = loop_preheader_edge (m_outer);
  gphi_iterator gsi;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "\nLoop(%d) carried vars:\n", m_loop->num);

  /* Every non-virtual header PHI must classify as either an induction
     or a supported reduction; otherwise interchange is abandoned.  */
  for (gsi = gsi_start_phis (m_loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();

      tree var = PHI_RESULT (phi);
      if (virtual_operand_p (var))
	continue;

      /* Compute the var's evolution, instantiated at the outer loop's
	 preheader.  */
      tree chrec = analyze_scalar_evolution (m_loop, var);
      chrec = instantiate_scev (e, m_loop, chrec);

      /* Analyze var as reduction variable.  */
      if (chrec_contains_undetermined (chrec)
	  || chrec_contains_symbols_defined_in_loop (chrec, m_outer->num))
	{
	  if (iloop && !analyze_oloop_reduction_var (iloop, var))
	    return false;
	  if (!iloop && !analyze_iloop_reduction_var (var))
	    return false;
	}
      /* Analyze var as induction variable.  */
      else if (!analyze_induction_var (var, chrec))
	return false;
    }

  return true;
}
763 :
764 : /* Return TRUE if loop closed PHI nodes can be analyzed successfully. */
765 :
766 : bool
767 152 : loop_cand::analyze_lcssa_phis (void)
768 : {
769 152 : gphi_iterator gsi;
770 328 : for (gsi = gsi_start_phis (m_exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
771 : {
772 179 : gphi *phi = gsi.phi ();
773 :
774 358 : if (virtual_operand_p (PHI_RESULT (phi)))
775 143 : continue;
776 :
777 : /* TODO: We only support lcssa phi for reduction for now. */
778 36 : if (!find_reduction_by_stmt (phi))
779 : return false;
780 : }
781 :
782 : return true;
783 : }
784 :
785 : /* CONSUMER is a stmt in BB storing reduction result into memory object.
786 : When the reduction is initialized from a constant value, we need to add
787 : a stmt loading from the memory object to target basic block in inner
788 : loop during undoing the reduction. Problem is that memory reference
789 : may use ssa variables not dominating the target basic block. This
790 : function finds all stmts on which CONSUMER depends in basic block BB,
791 : records and returns them via STMTS. */
792 :
static void
find_deps_in_bb_for_stmt (gimple_seq *stmts, basic_block bb, gimple *consumer)
{
  auto_vec<gimple *, 4> worklist;
  use_operand_p use_p;
  ssa_op_iter iter;
  gimple *stmt, *def_stmt;
  gimple_stmt_iterator gsi;

  /* First clear flag for stmts in bb.  GF_PLF_1 is used as the
     "depended-on" mark below.  */
  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    gimple_set_plf (gsi_stmt (gsi), GF_PLF_1, false);

  /* DFS search all depended stmts in bb and mark flag for these stmts.  */
  worklist.safe_push (consumer);
  while (!worklist.is_empty ())
    {
      stmt = worklist.pop ();
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
	{
	  def_stmt = SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p));

	  /* Stop at PHIs, defs outside BB, and already-visited stmts.  */
	  if (is_a <gphi *> (def_stmt)
	      || gimple_bb (def_stmt) != bb
	      || gimple_plf (def_stmt, GF_PLF_1))
	    continue;

	  worklist.safe_push (def_stmt);
	}
      gimple_set_plf (stmt, GF_PLF_1, true);
    }
  /* Scan BB up to (but excluding) CONSUMER, detaching each marked stmt
     into STMTS in original order.  */
  for (gsi = gsi_start_nondebug_bb (bb);
       !gsi_end_p (gsi) && (stmt = gsi_stmt (gsi)) != consumer;)
    {
      /* Move dep stmts to sequence STMTS.  Note gsi_remove advances the
	 iterator, so only advance explicitly when nothing is removed.  */
      if (gimple_plf (stmt, GF_PLF_1))
	{
	  gsi_remove (&gsi, false);
	  gimple_seq_add_stmt_without_update (stmts, stmt);
	}
      else
	gsi_next_nondebug (&gsi);
    }
}
837 :
838 : /* User can write, optimizers can generate simple reduction RE for inner
839 : loop. In order to make interchange valid, we have to undo reduction by
840 : moving producer and consumer stmts into the inner loop. For example,
841 : below code:
842 :
843 : init = MEM_REF[idx]; //producer
844 : loop:
845 : var = phi<init, next>
846 : next = var op ...
847 : reduc_sum = phi<next>
848 : MEM_REF[idx] = reduc_sum //consumer
849 :
850 : is transformed into:
851 :
852 : loop:
853 : new_var = MEM_REF[idx]; //producer after moving
854 : next = new_var op ...
855 : MEM_REF[idx] = next; //consumer after moving
856 :
857 : Note if the reduction variable is initialized to constant, like:
858 :
859 : var = phi<0.0, next>
860 :
861 : we compute new_var as below:
862 :
863 : loop:
864 : tmp = MEM_REF[idx];
865 : new_var = !first_iteration ? tmp : 0.0;
866 :
867 : so that the initial const is used in the first iteration of loop. Also
868 : record ssa variables for dead code elimination in DCE_SEEDS. */
869 :
void
loop_cand::undo_simple_reduction (reduction_p re, bitmap dce_seeds)
{
  gimple *stmt;
  gimple_stmt_iterator from, to = gsi_after_labels (m_loop->header);
  gimple_seq stmts = NULL;
  tree new_var;

  /* Prepare the initialization stmts and insert it to inner loop.  */
  if (re->producer != NULL)
    {
      /* Reduction init was a load: move the producer load itself into
	 the inner loop header.  Its vuse is cleared; virtual operands
	 are recomputed after the transform.  */
      gimple_set_vuse (re->producer, NULL_TREE);
      update_stmt (re->producer);
      from = gsi_for_stmt (re->producer);
      gsi_remove (&from, false);
      gimple_seq_add_stmt_without_update (&stmts, re->producer);
      new_var = re->init;
    }
  else
    {
      /* Find all stmts on which expression "MEM_REF[idx]" depends.  */
      find_deps_in_bb_for_stmt (&stmts, gimple_bb (re->consumer), re->consumer);
      /* Because we generate new stmt loading from the MEM_REF to TMP.  */
      tree cond, tmp = copy_ssa_name (re->var);
      stmt = gimple_build_assign (tmp, re->init_ref);
      gimple_seq_add_stmt_without_update (&stmts, stmt);

      /* Init new_var to MEM_REF or CONST depending on if it is the first
	 iteration.  The first iteration is detected by comparing the
	 first induction variable against its initial value.  */
      induction_p iv = m_inductions[0];
      cond = make_ssa_name (boolean_type_node);
      stmt = gimple_build_assign (cond, NE_EXPR, iv->var, iv->init_val);
      gimple_seq_add_stmt_without_update (&stmts, stmt);
      new_var = copy_ssa_name (re->var);
      stmt = gimple_build_assign (new_var, COND_EXPR, cond, tmp, re->init);
      gimple_seq_add_stmt_without_update (&stmts, stmt);
    }
  gsi_insert_seq_before (&to, stmts, GSI_SAME_STMT);

  /* Replace all uses of reduction var with new variable.  */
  use_operand_p use_p;
  imm_use_iterator iterator;
  FOR_EACH_IMM_USE_STMT (stmt, iterator, re->var)
    {
      FOR_EACH_IMM_USE_ON_STMT (use_p, iterator)
	SET_USE (use_p, new_var);

      update_stmt (stmt);
    }

  /* Move consumer stmt into inner loop, just after reduction next's def.
     Its virtual operands are dropped and its rhs becomes the per-iteration
     value NEXT, turning the post-loop store into an in-loop store.  */
  unlink_stmt_vdef (re->consumer);
  release_ssa_name (gimple_vdef (re->consumer));
  gimple_set_vdef (re->consumer, NULL_TREE);
  gimple_set_vuse (re->consumer, NULL_TREE);
  gimple_assign_set_rhs1 (re->consumer, re->next);
  update_stmt (re->consumer);
  from = gsi_for_stmt (re->consumer);
  to = gsi_for_stmt (SSA_NAME_DEF_STMT (re->next));
  gsi_move_after (&from, &to);

  /* Mark the reduction variables for DCE.  */
  bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (re->var));
  bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (PHI_RESULT (re->lcssa_phi)));
}
935 :
936 : /* Free DATAREFS and its auxiliary memory. */
937 :
938 : static void
939 55436 : free_data_refs_with_aux (vec<data_reference_p> datarefs)
940 : {
941 55436 : data_reference_p dr;
942 60440 : for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
943 5004 : if (dr->aux != NULL)
944 : {
945 4956 : DR_ACCESS_STRIDE (dr)->release ();
946 4956 : delete (vec<tree> *) dr->aux;
947 : }
948 :
949 55436 : free_data_refs (datarefs);
950 55436 : }
951 :
/* Class for loop interchange transformation.  One instance drives all
   interchange attempts on a single perfect loop nest.  */

class tree_loop_interchange
{
public:
  tree_loop_interchange (vec<class loop *> loop_nest)
    : m_loop_nest (loop_nest), m_niters_iv_var (NULL_TREE),
      m_dce_seeds (BITMAP_ALLOC (NULL)) { }
  ~tree_loop_interchange () { BITMAP_FREE (m_dce_seeds); }
  /* Entry point: try to push inner loops outward; returns true if any
     pair of loops was actually interchanged.  */
  bool interchange (vec<data_reference_p>, vec<ddr_p>);

private:
  /* Refresh cached strides/distance vectors after an interchange.  */
  void update_data_info (unsigned, unsigned, vec<data_reference_p>, vec<ddr_p>);
  /* Legality check based on dependence distance vectors.  */
  bool valid_data_dependences (unsigned, unsigned, vec<ddr_p>);
  /* The actual CFG/SSA transformation for one loop pair.  */
  void interchange_loops (loop_cand &, loop_cand &);
  void map_inductions_to_loop (loop_cand &, loop_cand &);
  void move_code_to_inner_loop (class loop *, class loop *, basic_block *);

  /* The whole loop nest in which interchange is ongoing.  */
  vec<class loop *> m_loop_nest;
  /* We create new IV which is only used in loop's exit condition check.
     In case of 3-level loop nest interchange, when we interchange the
     innermost two loops, new IV created in the middle level loop does
     not need to be preserved in interchanging the outermost two loops
     later.  We record the IV so that it can be skipped.  */
  tree m_niters_iv_var;
  /* Bitmap of seed variables for dead code elimination after interchange.  */
  bitmap m_dce_seeds;
};
981 :
982 : /* Update data refs' access stride and dependence information after loop
983 : interchange. I_IDX/O_IDX gives indices of interchanged loops in loop
984 : nest. DATAREFS are data references. DDRS are data dependences. */
985 :
986 : void
987 13 : tree_loop_interchange::update_data_info (unsigned i_idx, unsigned o_idx,
988 : vec<data_reference_p> datarefs,
989 : vec<ddr_p> ddrs)
990 : {
991 13 : struct data_reference *dr;
992 13 : struct data_dependence_relation *ddr;
993 :
994 : /* Update strides of data references. */
995 145 : for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
996 : {
997 132 : vec<tree> *stride = DR_ACCESS_STRIDE (dr);
998 132 : gcc_assert (stride->length () > i_idx);
999 132 : std::swap ((*stride)[i_idx], (*stride)[o_idx]);
1000 : }
1001 : /* Update data dependences. */
1002 2306 : for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
1003 2293 : if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
1004 : {
1005 2299 : for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
1006 : {
1007 6 : lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
1008 6 : std::swap (dist_vect[i_idx], dist_vect[o_idx]);
1009 : }
1010 : }
1011 13 : }
1012 :
/* Check data dependence relations, return TRUE if it's valid to interchange
   two loops specified by I_IDX/O_IDX.  Theoretically, interchanging the two
   loops is valid only if the dist vector, after interchanging, doesn't have
   '>' as the leftmost non-'=' direction.  Practically, this function is
   conservative and rejects some valid cases rather than checking them.  */

bool
tree_loop_interchange::valid_data_dependences (unsigned i_idx, unsigned o_idx,
					       vec<ddr_p> ddrs)
{
  struct data_dependence_relation *ddr;

  for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i)
    {
      /* Skip no-dependence case.  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
	continue;

      for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j)
	{
	  lambda_vector dist_vect = DDR_DIST_VECT (ddr, j);
	  unsigned level = dependence_level (dist_vect, m_loop_nest.length ());

	  /* If there is no carried dependence.  */
	  if (level == 0)
	    continue;

	  /* dependence_level returns a 1-based level; convert to a 0-based
	     index into the distance vector.  */
	  level --;

	  /* If dependence is not carried by any loop in between the two
	     loops [oloop, iloop] to interchange.  */
	  if (level < o_idx || level > i_idx)
	    continue;

	  /* Be conservative, skip case if either direction at i_idx/o_idx
	     levels is not '=' or '<'.  Note DDR_REVERSED_P flips the sign
	     convention of the distance entries.  */
	  if ((!DDR_REVERSED_P (ddr) && dist_vect[i_idx] < 0)
	      || (DDR_REVERSED_P (ddr) && dist_vect[i_idx] > 0)
	      || (!DDR_REVERSED_P (ddr) && dist_vect[o_idx] < 0)
	      || (DDR_REVERSED_P (ddr) && dist_vect[o_idx] > 0))
	    return false;
	}
    }

  return true;
}
1059 :
/* Interchange two loops specified by ILOOP and OLOOP.  */

void
tree_loop_interchange::interchange_loops (loop_cand &iloop, loop_cand &oloop)
{
  reduction_p re;
  gimple_stmt_iterator gsi;
  tree i_niters, o_niters, var_after;

  /* Undo inner loop's simple reduction.  Double reductions are kept as-is;
     only non-DOUBLE_RTYPE reductions are rewritten back to memory form.  */
  for (unsigned i = 0; iloop.m_reductions.iterate (i, &re); ++i)
    if (re->type != DOUBLE_RTYPE)
      {
	if (re->producer)
	  reset_debug_uses (re->producer);

	iloop.undo_simple_reduction (re, m_dce_seeds);
      }

  /* Only need to reset debug uses for double reduction.  */
  for (unsigned i = 0; oloop.m_reductions.iterate (i, &re); ++i)
    {
      gcc_assert (re->type == DOUBLE_RTYPE);
      reset_debug_uses (SSA_NAME_DEF_STMT (re->var));
      reset_debug_uses (SSA_NAME_DEF_STMT (re->next));
    }

  /* Prepare niters for both loops.  Both expressions are instantiated at
     the preheader of the outermost loop of the nest and materialized as
     gimple there, so they are available when the loops swap positions.  */
  class loop *loop_nest = m_loop_nest[0];
  edge instantiate_below = loop_preheader_edge (loop_nest);
  gsi = gsi_last_bb (loop_preheader_edge (loop_nest)->src);
  i_niters = number_of_latch_executions (iloop.m_loop);
  i_niters = analyze_scalar_evolution (loop_outer (iloop.m_loop), i_niters);
  i_niters = instantiate_scev (instantiate_below, loop_outer (iloop.m_loop),
			       i_niters);
  i_niters = force_gimple_operand_gsi (&gsi, unshare_expr (i_niters), true,
				       NULL_TREE, false, GSI_CONTINUE_LINKING);
  o_niters = number_of_latch_executions (oloop.m_loop);
  if (oloop.m_loop != loop_nest)
    {
      /* Instantiation is only needed when OLOOP is not already the
	 outermost loop of the nest.  */
      o_niters = analyze_scalar_evolution (loop_outer (oloop.m_loop), o_niters);
      o_niters = instantiate_scev (instantiate_below, loop_outer (oloop.m_loop),
				   o_niters);
    }
  o_niters = force_gimple_operand_gsi (&gsi, unshare_expr (o_niters), true,
				       NULL_TREE, false, GSI_CONTINUE_LINKING);

  /* Move src's code to tgt loop.  This is necessary when src is the outer
     loop and tgt is the inner loop.  */
  move_code_to_inner_loop (oloop.m_loop, iloop.m_loop, oloop.m_bbs);

  /* Map outer loop's IV to inner loop, and vice versa.  */
  map_inductions_to_loop (oloop, iloop);
  map_inductions_to_loop (iloop, oloop);

  /* Create canonical IV for both loops.  Note canonical IV for outer/inner
     loop is actually from inner/outer loop.  Also we record the new IV
     created for the outer loop so that it can be skipped in later loop
     interchange.  */
  create_canonical_iv (oloop.m_loop, oloop.m_exit,
		       i_niters, &m_niters_iv_var, &var_after);
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
  create_canonical_iv (iloop.m_loop, iloop.m_exit,
		       o_niters, NULL, &var_after);
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));

  /* Scrap niters estimation of interchanged loops, as the bounds recorded
     for each loop structure now describe the other loop's trip count.  */
  iloop.m_loop->any_upper_bound = false;
  iloop.m_loop->any_likely_upper_bound = false;
  free_numbers_of_iterations_estimates (iloop.m_loop);
  oloop.m_loop->any_upper_bound = false;
  oloop.m_loop->any_likely_upper_bound = false;
  free_numbers_of_iterations_estimates (oloop.m_loop);

  /* Clear all cached scev information.  This is expensive but shouldn't be
     a problem given we interchange in very limited times.  */
  scev_reset_htab ();

  /* ??? The association between the loop data structure and the
     CFG changed, so what was loop N at the source level is now
     loop M.  We should think of retaining the association or breaking
     it fully by creating a new loop instead of re-using the "wrong" one.  */
}
1143 :
/* Map induction variables of SRC loop to TGT loop.  The function firstly
   creates the same IV of SRC loop in TGT loop, then deletes the original
   IV and re-initialize it using the newly created IV.  For example, loop
   nest:

     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
	 {
	   //use of i;
	   //use of j;
	 }

   will be transformed into:

     for (jj = 0; jj < M; jj++)
       for (ii = 0; ii < N; ii++)
	 {
	   //use of ii;
	   //use of jj;
	 }

   after loop interchange.  */

void
tree_loop_interchange::map_inductions_to_loop (loop_cand &src, loop_cand &tgt)
{
  induction_p iv;
  edge e = tgt.m_exit;
  /* New IV increments are inserted at the end of TGT's exit source block.  */
  gimple_stmt_iterator incr_pos = gsi_last_bb (e->src), gsi;

  /* Map source loop's IV to target loop.  */
  for (unsigned i = 0; src.m_inductions.iterate (i, &iv); ++i)
    {
      gimple *use_stmt, *stmt = SSA_NAME_DEF_STMT (iv->var);
      gcc_assert (is_a <gphi *> (stmt));

      use_operand_p use_p;
      /* Only map original IV to target loop.  The niters IV created by a
	 previous interchange (m_niters_iv_var) needs no replacement; its
	 PHI is still removed below.  */
      if (m_niters_iv_var != iv->var)
	{
	  /* Map the IV by creating the same one in target loop.  */
	  tree var_before, var_after;
	  tree base = unshare_expr (iv->init_expr);
	  tree step = unshare_expr (iv->step);
	  create_iv (base, PLUS_EXPR, step, SSA_NAME_VAR (iv->var),
		     tgt.m_loop, &incr_pos, false, &var_before, &var_after);
	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
	  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));

	  /* Replace uses of the original IV var with newly created IV var.  */
	  imm_use_iterator imm_iter;
	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, iv->var)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
		SET_USE (use_p, var_before);

	      update_stmt (use_stmt);
	    }
	}

      /* Mark all uses for DCE.  Operands fed into the old PHI become dead
	 once the PHI is removed; seed them so DCE can clean them up.  */
      ssa_op_iter op_iter;
      FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, op_iter, SSA_OP_USE)
	{
	  tree use = USE_FROM_PTR (use_p);
	  if (TREE_CODE (use) == SSA_NAME
	      && ! SSA_NAME_IS_DEFAULT_DEF (use))
	    bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (use));
	}

      /* Delete definition of the original IV in the source loop.  */
      gsi = gsi_for_stmt (stmt);
      remove_phi_node (&gsi, true);
    }
}
1219 :
/* Move stmts of outer loop to inner loop.  OUTER_BBS is the array of
   OUTER's basic blocks; blocks belonging to INNER are left alone.  */

void
tree_loop_interchange::move_code_to_inner_loop (class loop *outer,
						class loop *inner,
						basic_block *outer_bbs)
{
  basic_block oloop_exit_bb = single_exit (outer)->src;
  gimple_stmt_iterator gsi, to;

  for (unsigned i = 0; i < outer->num_nodes; i++)
    {
      basic_block bb = outer_bbs[i];

      /* Skip basic blocks of inner loop.  */
      if (flow_bb_inside_loop_p (inner, bb))
	continue;

      /* Move code from header/latch to header/latch.  */
      if (bb == outer->header)
	to = gsi_after_labels (inner->header);
      else if (bb == outer->latch)
	to = gsi_after_labels (inner->latch);
      else
	/* Otherwise, simply move to exit->src.  */
	to = gsi_last_bb (single_exit (inner)->src);

      /* NB: gsi is only advanced explicitly when a stmt is kept in place;
	 gsi_move_before advances it implicitly.  */
      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
	{
	  gimple *stmt = gsi_stmt (gsi);

	  /* Keep the exit condition of the outer loop where it is.  */
	  if (oloop_exit_bb == bb
	      && stmt == gsi_stmt (gsi_last_bb (oloop_exit_bb)))
	    {
	      gsi_next (&gsi);
	      continue;
	    }

	  /* Strip virtual operands before moving; the virtual SSA web of
	     the old position would be invalid at the new one.  */
	  if (gimple_vdef (stmt))
	    {
	      unlink_stmt_vdef (stmt);
	      release_ssa_name (gimple_vdef (stmt));
	      gimple_set_vdef (stmt, NULL_TREE);
	    }
	  if (gimple_vuse (stmt))
	    {
	      gimple_set_vuse (stmt, NULL_TREE);
	      update_stmt (stmt);
	    }

	  reset_debug_uses (stmt);
	  gsi_move_before (&gsi, &to);
	}
    }
}
1275 :
/* Given data reference DR in LOOP_NEST, the function computes DR's access
   stride at each level of loop from innermost LOOP to outer.  On success,
   it saves access stride at each level loop in a vector which is pointed
   by DR->aux.  For example:

     int arr[100][100][100];
     for (i = 0; i < 100; i++)        ;(DR->aux)strides[0] = 40000
       for (j = 100; j > 0; j--)      ;(DR->aux)strides[1] = 400
	 for (k = 0; k < 100; k++)    ;(DR->aux)strides[2] = 4
	   arr[i][j - 1][k] = 0;

   LOOP_NEST is passed by reference: if the address SCEV cannot be
   instantiated for the whole nest, the nest is shrunk to an inner loop.
   On failure the stride vector is left shorter than the nest depth, which
   callers detect.  */

static void
compute_access_stride (class loop *&loop_nest, class loop *loop,
		       data_reference_p dr)
{
  vec<tree> *strides = new vec<tree> ();
  dr->aux = strides;

  basic_block bb = gimple_bb (DR_STMT (dr));
  if (!flow_bb_inside_loop_p (loop_nest, bb))
    return;
  /* DR may live in a loop enclosing LOOP; its stride w.r.t. each skipped
     inner level is zero.  */
  while (!flow_bb_inside_loop_p (loop, bb))
    {
      strides->safe_push (build_int_cst (sizetype, 0));
      loop = loop_outer (loop);
    }
  gcc_assert (loop == bb->loop_father);

  tree ref = DR_REF (dr);
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_BIT_FIELD (TREE_OPERAND (ref, 1)))
    {
      /* We can't take address of bitfields.  If the bitfield is at constant
	 offset from the start of the struct, just use address of the
	 struct, for analysis of the strides that shouldn't matter.  */
      if (!TREE_OPERAND (ref, 2)
	  || TREE_CODE (TREE_OPERAND (ref, 2)) == INTEGER_CST)
	ref = TREE_OPERAND (ref, 0);
      /* Otherwise, if we have a bit field representative, use that.  */
      else if (DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1))
	       != NULL_TREE)
	{
	  tree repr = DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1));
	  ref = build3 (COMPONENT_REF, TREE_TYPE (repr), TREE_OPERAND (ref, 0),
			repr, TREE_OPERAND (ref, 2));
	}
      /* Otherwise punt.  */
      else
	return;
    }
  tree scev_base = build_fold_addr_expr (ref);
  tree scev = analyze_scalar_evolution (loop, scev_base);
  if (chrec_contains_undetermined (scev))
    return;

  tree orig_scev = scev;
  do
    {
      scev = instantiate_scev (loop_preheader_edge (loop_nest),
			       loop, orig_scev);
      if (! chrec_contains_undetermined (scev))
	break;

      /* If we couldn't instantiate for the desired nest, shrink it.  */
      if (loop_nest == loop)
	return;
      loop_nest = loop_nest->inner;
    } while (1);

  /* Peel the polynomial chrec from the inside out; CHREC_RIGHT of each
     level is the access stride of the corresponding loop.  Levels the
     address does not vary in get a zero stride.  */
  tree sl = scev;
  class loop *expected = loop;
  while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
    {
      class loop *sl_loop = get_chrec_loop (sl);
      while (sl_loop != expected)
	{
	  strides->safe_push (size_int (0));
	  expected = loop_outer (expected);
	}
      strides->safe_push (CHREC_RIGHT (sl));
      sl = CHREC_LEFT (sl);
      expected = loop_outer (expected);
    }
  /* Pad remaining outer levels with zero strides, but only if the whole
     evolution was resolved (no chrecs left in the base).  */
  if (! tree_contains_chrecs (sl, NULL))
    while (expected != loop_outer (loop_nest))
      {
	strides->safe_push (size_int (0));
	expected = loop_outer (expected);
      }
}
1366 :
/* Given loop nest LOOP_NEST with innermost LOOP, the function computes
   access strides with respect to each level loop for all data refs in
   DATAREFS from inner loop to outer loop.  On success, it returns the
   outermost loop that access strides can be computed successfully for
   all data references.  If access strides cannot be computed at least
   for two levels of loop for any data reference, it returns NULL.  */

static class loop *
compute_access_strides (class loop *loop_nest, class loop *loop,
			vec<data_reference_p> datarefs)
{
  unsigned i, j, num_loops = (unsigned) -1;
  data_reference_p dr;
  vec<tree> *stride;

  /* compute_access_stride may shrink this nest when instantiation fails;
     NUM_LOOPS tracks the smallest stride-vector length seen so far.  */
  class loop *interesting_loop_nest = loop_nest;
  for (i = 0; datarefs.iterate (i, &dr); ++i)
    {
      compute_access_stride (interesting_loop_nest, loop, dr);
      stride = DR_ACCESS_STRIDE (dr);
      if (stride->length () < num_loops)
	{
	  num_loops = stride->length ();
	  if (num_loops < 2)
	    return NULL;
	}
    }

  /* Normalize every stride vector to NUM_LOOPS entries and reverse it so
     strides are ordered from outermost loop to innermost.  */
  for (i = 0; datarefs.iterate (i, &dr); ++i)
    {
      stride = DR_ACCESS_STRIDE (dr);
      if (stride->length () > num_loops)
	stride->truncate (num_loops);

      for (j = 0; j < (num_loops >> 1); ++j)
	std::swap ((*stride)[j], (*stride)[num_loops - j - 1]);
    }

  /* Return the outermost loop covered by all NUM_LOOPS stride levels.  */
  loop = superloop_at_depth (loop, loop_depth (loop) + 1 - num_loops);
  gcc_assert (loop_nest == loop || flow_loop_nested_p (loop_nest, loop));
  return loop;
}
1409 :
1410 : /* Prune access strides for data references in DATAREFS by removing strides
1411 : of loops that isn't in current LOOP_NEST. */
1412 :
1413 : static void
1414 3 : prune_access_strides_not_in_loop (class loop *loop_nest,
1415 : class loop *innermost,
1416 : vec<data_reference_p> datarefs)
1417 : {
1418 3 : data_reference_p dr;
1419 6 : unsigned num_loops = loop_depth (innermost) - loop_depth (loop_nest) + 1;
1420 3 : gcc_assert (num_loops > 1);
1421 :
1422 : /* Block remove strides of loops that is not in current loop nest. */
1423 18 : for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
1424 : {
1425 15 : vec<tree> *stride = DR_ACCESS_STRIDE (dr);
1426 30 : if (stride->length () > num_loops)
1427 15 : stride->block_remove (0, stride->length () - num_loops);
1428 : }
1429 3 : }
1430 :
1431 : /* Dump access strides for all DATAREFS. */
1432 :
1433 : static void
1434 18 : dump_access_strides (vec<data_reference_p> datarefs)
1435 : {
1436 18 : data_reference_p dr;
1437 18 : fprintf (dump_file, "Access Strides for DRs:\n");
1438 110 : for (unsigned i = 0; datarefs.iterate (i, &dr); ++i)
1439 : {
1440 92 : fprintf (dump_file, " ");
1441 92 : print_generic_expr (dump_file, DR_REF (dr), TDF_SLIM);
1442 92 : fprintf (dump_file, ":\t\t<");
1443 :
1444 92 : vec<tree> *stride = DR_ACCESS_STRIDE (dr);
1445 92 : unsigned num_loops = stride->length ();
1446 288 : for (unsigned j = 0; j < num_loops; ++j)
1447 : {
1448 196 : print_generic_expr (dump_file, (*stride)[j], TDF_SLIM);
1449 288 : fprintf (dump_file, "%s", (j < num_loops - 1) ? ",\t" : ">\n");
1450 : }
1451 : }
1452 18 : }
1453 :
/* Return true if it's profitable to interchange two loops whose index
   in whole loop nest vector are I_IDX/O_IDX respectively.  The function
   computes and compares three types information from all DATAREFS:
     1) Access stride for loop I_IDX and O_IDX.
     2) Number of invariant memory references with respect to I_IDX before
	and after loop interchange.
     3) Flags indicating if all memory references access sequential memory
	in ILOOP, before and after loop interchange.
   If INNERMOST_LOOPS_P is true, the two loops for interchanging are the
   two innermost loops in loop nest.  This function also dumps information
   if DUMP_INFO_P is true.  */

static bool
should_interchange_loops (unsigned i_idx, unsigned o_idx,
			  vec<data_reference_p> datarefs,
			  unsigned i_stmt_cost, unsigned o_stmt_cost,
			  bool innermost_loops_p, bool dump_info_p = true)
{
  unsigned HOST_WIDE_INT ratio;
  unsigned i, j, num_old_inv_drs = 0, num_new_inv_drs = 0;
  struct data_reference *dr;
  bool all_seq_dr_before_p = true, all_seq_dr_after_p = true;
  widest_int iloop_strides = 0, oloop_strides = 0;
  unsigned num_unresolved_drs = 0;
  unsigned num_resolved_ok_drs = 0;
  unsigned num_resolved_not_ok_drs = 0;

  if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "\nData ref strides:\n\tmem_ref:\t\tiloop\toloop\n");

  for (i = 0; datarefs.iterate (i, &dr); ++i)
    {
      vec<tree> *stride = DR_ACCESS_STRIDE (dr);
      tree iloop_stride = (*stride)[i_idx], oloop_stride = (*stride)[o_idx];

      bool subloop_stride_p = false;
      /* Data ref can't be invariant or sequential access at current loop if
	 its address changes with respect to any subloops.  */
      for (j = i_idx + 1; j < stride->length (); ++j)
	if (!integer_zerop ((*stride)[j]))
	  {
	    subloop_stride_p = true;
	    break;
	  }

      /* A zero stride at a level means the ref is invariant there.  */
      if (integer_zerop (iloop_stride))
	{
	  if (!subloop_stride_p)
	    num_old_inv_drs++;
	}
      if (integer_zerop (oloop_stride))
	{
	  if (!subloop_stride_p)
	    num_new_inv_drs++;
	}

      /* Constant strides are summed for the overall comparison below;
	 symbolic strides are classified via divisibility: iloop a multiple
	 of oloop means interchange can only reduce the inner stride.  */
      if (TREE_CODE (iloop_stride) == INTEGER_CST
	  && TREE_CODE (oloop_stride) == INTEGER_CST)
	{
	  iloop_strides = wi::add (iloop_strides, wi::to_widest (iloop_stride));
	  oloop_strides = wi::add (oloop_strides, wi::to_widest (oloop_stride));
	}
      else if (multiple_of_p (TREE_TYPE (iloop_stride),
			      iloop_stride, oloop_stride))
	num_resolved_ok_drs++;
      else if (multiple_of_p (TREE_TYPE (iloop_stride),
			      oloop_stride, iloop_stride))
	num_resolved_not_ok_drs++;
      else
	num_unresolved_drs++;

      /* Data ref can't be sequential access if its address changes in sub
	 loop.  */
      if (subloop_stride_p)
	{
	  all_seq_dr_before_p = false;
	  all_seq_dr_after_p = false;
	  continue;
	}
      /* Track if all data references are sequential accesses before/after loop
	 interchange.  Note invariant is considered sequential here.  */
      tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
      if (all_seq_dr_before_p
	  && ! (integer_zerop (iloop_stride)
		|| operand_equal_p (access_size, iloop_stride, 0)))
	all_seq_dr_before_p = false;
      if (all_seq_dr_after_p
	  && ! (integer_zerop (oloop_stride)
		|| operand_equal_p (access_size, oloop_stride, 0)))
	all_seq_dr_after_p = false;
    }

  if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "\toverall:\t\t");
      print_decu (iloop_strides, dump_file);
      fprintf (dump_file, "\t");
      print_decu (oloop_strides, dump_file);
      fprintf (dump_file, "\n");

      fprintf (dump_file, "Invariant data ref: before(%d), after(%d)\n",
	       num_old_inv_drs, num_new_inv_drs);
      fprintf (dump_file, "All consecutive stride: before(%s), after(%s)\n",
	       all_seq_dr_before_p ? "true" : "false",
	       all_seq_dr_after_p ? "true" : "false");
      fprintf (dump_file, "OK to interchage with variable strides: %d\n",
	       num_resolved_ok_drs);
      fprintf (dump_file, "Not OK to interchage with variable strides: %d\n",
	       num_resolved_not_ok_drs);
      fprintf (dump_file, "Variable strides we cannot decide: %d\n",
	       num_unresolved_drs);
      fprintf (dump_file, "Stmt cost of inner loop: %d\n", i_stmt_cost);
      fprintf (dump_file, "Stmt cost of outer loop: %d\n", o_stmt_cost);
    }

  /* Refuse if any symbolic stride is unknown or points the wrong way.  */
  if (num_unresolved_drs != 0 || num_resolved_not_ok_drs != 0)
    return false;

  /* Stmts of outer loop will be moved to inner loop.  If there are two many
     such stmts, it could make inner loop costly.  Here we compare stmt cost
     between outer and inner loops.  */
  if (i_stmt_cost && o_stmt_cost
      && num_old_inv_drs + o_stmt_cost > num_new_inv_drs
      && o_stmt_cost * STMT_COST_RATIO > i_stmt_cost)
    return false;

  /* We use different stride comparison ratio for interchanging innermost
     two loops or not.  The idea is to be conservative in interchange for
     the innermost loops.  */
  ratio = innermost_loops_p ? INNER_STRIDE_RATIO : OUTER_STRIDE_RATIO;
  /* Do interchange if it gives better data locality behavior.  */
  if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio)))
    return true;
  if (wi::gtu_p (iloop_strides, oloop_strides))
    {
      /* Or it creates more invariant memory references.  */
      if ((!all_seq_dr_before_p || all_seq_dr_after_p)
	  && num_new_inv_drs > num_old_inv_drs)
	return true;
      /* Or it makes all memory references sequential.  */
      if (num_new_inv_drs >= num_old_inv_drs
	  && !all_seq_dr_before_p && all_seq_dr_after_p)
	return true;
    }

  return false;
}
1601 :
/* Try to interchange inner loop of a loop nest to outer level.  Returns
   true if any pair of loops was interchanged.  */

bool
tree_loop_interchange::interchange (vec<data_reference_p> datarefs,
				    vec<ddr_p> ddrs)
{
  dump_user_location_t loc = find_loop_location (m_loop_nest[0]);
  bool changed_p = false;
  /* In each iteration we try to interchange I-th loop with (I+1)-th loop.
     The overall effect is to push inner loop to outermost level in whole
     loop nest.  */
  for (unsigned i = m_loop_nest.length (); i > 1; --i)
    {
      unsigned i_idx = i - 1, o_idx = i - 2;

      /* Check validity for loop interchange.  */
      if (!valid_data_dependences (i_idx, o_idx, ddrs))
	break;

      loop_cand iloop (m_loop_nest[i_idx], m_loop_nest[o_idx]);
      loop_cand oloop (m_loop_nest[o_idx], m_loop_nest[o_idx]);

      /* Check if we can do transformation for loop interchange.  */
      if (!iloop.analyze_carried_vars (NULL)
	  || !iloop.analyze_lcssa_phis ()
	  || !oloop.analyze_carried_vars (&iloop)
	  || !oloop.analyze_lcssa_phis ()
	  || !iloop.can_interchange_p (NULL)
	  || !oloop.can_interchange_p (&iloop))
	break;

      /* Outer loop's stmts will be moved to inner loop during interchange.
	 If there are many of them, it may make inner loop very costly.  We
	 need to check number of outer loop's stmts in profit cost model of
	 interchange.  */
      int stmt_cost = oloop.m_num_stmts;
      /* Count out the exit checking stmt of outer loop.  */
      stmt_cost --;
      /* Count out IV's increasing stmt, IVOPTs takes care if it.  */
      stmt_cost -= oloop.m_inductions.length ();
      /* Count in the additional load and cond_expr stmts caused by inner
	 loop's constant initialized reduction.  */
      stmt_cost += iloop.m_const_init_reduc * 2;
      if (stmt_cost < 0)
	stmt_cost = 0;

      /* Check profitability for loop interchange.  */
      if (should_interchange_loops (i_idx, o_idx, datarefs,
				    (unsigned) iloop.m_num_stmts,
				    (unsigned) stmt_cost,
				    iloop.m_loop->inner == NULL))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Loop_pair<outer:%d, inner:%d> is interchanged\n\n",
		     oloop.m_loop->num, iloop.m_loop->num);

	  changed_p = true;
	  interchange_loops (iloop, oloop);
	  /* No need to update if there is no further loop interchange.  */
	  if (o_idx > 0)
	    update_data_info (i_idx, o_idx, datarefs, ddrs);
	}
      else
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Loop_pair<outer:%d, inner:%d> is not interchanged\n\n",
		     oloop.m_loop->num, iloop.m_loop->num);
	}
    }
  /* Clean up code made dead by the transformations above.  */
  simple_dce_from_worklist (m_dce_seeds);

  if (changed_p && dump_enabled_p ())
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
		     "loops interchanged in loop nest\n");

  return changed_p;
}
1681 :
1682 :
1683 : /* Loop interchange pass. */
1684 :
1685 : namespace {
1686 :
/* Static metadata describing the linterchange pass to the pass manager.  */
const pass_data pass_data_linterchange =
{
  GIMPLE_PASS, /* type */
  "linterchange", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LINTERCHANGE, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
1699 :
/* GIMPLE pass wrapper; gated on -floop-interchange.  */
class pass_linterchange : public gimple_opt_pass
{
public:
  pass_linterchange (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_linterchange, ctxt)
  {}

  /* opt_pass methods: */
  opt_pass * clone () final override { return new pass_linterchange (m_ctxt); }
  bool gate (function *) final override { return flag_loop_interchange; }
  unsigned int execute (function *) final override;

}; // class pass_linterchange
1713 :
1714 :
1715 : /* Return true if LOOP has proper form for interchange. We check three
1716 : conditions in the function:
1717 : 1) In general, a loop can be interchanged only if it doesn't have
1718 : basic blocks other than header, exit and latch besides possible
1719 : inner loop nest. This basically restricts loop interchange to
1720 : below form loop nests:
1721 :
1722 : header<---+
1723 : | |
1724 : v |
1725 : INNER_LOOP |
1726 : | |
1727 : v |
1728 : exit--->latch
1729 :
1730 : 2) Data reference in basic block that executes in different times
1731 : than loop head/exit is not allowed.
1732 : 3) Record the innermost outer loop that doesn't form rectangle loop
1733 : nest with LOOP. */
1734 :
static bool
proper_loop_form_for_interchange (class loop *loop, class loop **min_outer)
{
  edge e0, e1, exit;

  /* Don't interchange if loop has unsupported information for the moment.  */
  if (loop->safelen > 0
      || loop->constraints != 0
      || loop->can_be_parallel
      || loop->dont_vectorize
      || loop->force_vectorize
      || loop->in_oacc_kernels_region
      || loop->orig_loop_num != 0
      || loop->simduid != NULL_TREE)
    return false;

  /* Don't interchange if outer loop has basic block other than header, exit
     and latch.  Inner nest accounts for all remaining nodes, hence the
     "+ 3".  */
  if (loop->inner != NULL
      && loop->num_nodes != loop->inner->num_nodes + 3)
    return false;

  /* Require a single exit that dominates the latch.  */
  if ((exit = single_dom_exit (loop)) == NULL)
    return false;

  /* Check control flow on loop header/exit blocks.  When the header is also
     the exit source, it must have exactly two preds (preheader and latch)
     and two succs (into the body and out of the loop).  Otherwise, header
     and exit->src must each be "simple": header with one successor on a
     supported edge, exit source with one predecessor on a supported edge
     and the two-way exit condition.  */
  if (loop->header == exit->src
      && (EDGE_COUNT (loop->header->preds) != 2
	  || EDGE_COUNT (loop->header->succs) != 2))
    return false;
  else if (loop->header != exit->src
	   && (EDGE_COUNT (loop->header->preds) != 2
	       || !single_succ_p (loop->header)
	       || unsupported_edge (single_succ_edge (loop->header))
	       || EDGE_COUNT (exit->src->succs) != 2
	       || !single_pred_p (exit->src)
	       || unsupported_edge (single_pred_edge (exit->src))))
    return false;

  /* Of the header's two predecessors, one must be the latch and at least
     one must come from outside the loop (the preheader).  */
  e0 = EDGE_PRED (loop->header, 0);
  e1 = EDGE_PRED (loop->header, 1);
  if (unsupported_edge (e0) || unsupported_edge (e1)
      || (e0->src != loop->latch && e1->src != loop->latch)
      || (e0->src->loop_father == loop && e1->src->loop_father == loop))
    return false;

  /* Of the exit source's two successors, one must be the latch and at
     least one must leave the loop (the exit edge itself).  */
  e0 = EDGE_SUCC (exit->src, 0);
  e1 = EDGE_SUCC (exit->src, 1);
  if (unsupported_edge (e0) || unsupported_edge (e1)
      || (e0->dest != loop->latch && e1->dest != loop->latch)
      || (e0->dest->loop_father == loop && e1->dest->loop_father == loop))
    return false;

  /* Don't interchange if any reference is in basic block that doesn't
     dominate exit block.  A stmt with a virtual use indicates a memory
     reference; such a stmt in a conditionally-executed block would run a
     different number of times after interchange.  */
  basic_block *bbs = get_loop_body (loop);
  for (unsigned i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      if (bb->loop_father != loop
	  || bb == loop->header || bb == exit->src
	  || dominated_by_p (CDI_DOMINATORS, exit->src, bb))
	continue;

      for (gimple_stmt_iterator gsi = gsi_start_nondebug_bb (bb);
	   !gsi_end_p (gsi); gsi_next_nondebug (&gsi))
	if (gimple_vuse (gsi_stmt (gsi)))
	  {
	    free (bbs);
	    return false;
	  }
    }
  free (bbs);

  /* The iteration count must be analyzable as a scalar evolution in the
     immediately enclosing loop.  */
  tree niters = number_of_latch_executions (loop);
  niters = analyze_scalar_evolution (loop_outer (loop), niters);
  if (!niters || chrec_contains_undetermined (niters))
    return false;

  /* Record the innermost outer loop that doesn't form rectangle loop nest:
     walk outward while still inside *MIN_OUTER, and stop at the first outer
     loop in which LOOP's iteration count is not invariant.  */
  for (loop_p loop2 = loop_outer (loop);
       loop2 && flow_loop_nested_p (*min_outer, loop2);
       loop2 = loop_outer (loop2))
    {
      niters = instantiate_scev (loop_preheader_edge (loop2),
				 loop_outer (loop), niters);
      if (!evolution_function_is_invariant_p (niters, loop2->num))
	{
	  *min_outer = loop2;
	  break;
	}
    }
  return true;
}
1830 :
1831 : /* Return true if any two adjacent loops in loop nest [INNERMOST, LOOP_NEST]
1832 : should be interchanged by looking into all DATAREFS. */
1833 :
1834 : static bool
1835 797 : should_interchange_loop_nest (class loop *loop_nest, class loop *innermost,
1836 : vec<data_reference_p> datarefs)
1837 : {
1838 1594 : unsigned idx = loop_depth (innermost) - loop_depth (loop_nest);
1839 797 : gcc_assert (idx > 0);
1840 :
1841 : /* Check if any two adjacent loops should be interchanged. */
1842 : for (class loop *loop = innermost;
1843 1728 : loop != loop_nest; loop = loop_outer (loop), idx--)
1844 1031 : if (should_interchange_loops (idx, idx - 1, datarefs, 0, 0,
1845 : loop == innermost, false))
1846 : return true;
1847 :
1848 : return false;
1849 : }
1850 :
1851 : /* Given loop nest LOOP_NEST and data references DATAREFS, compute data
1852 : dependences for loop interchange and store it in DDRS. Note we compute
1853 : dependences directly rather than call generic interface so that we can
1854 : return on unknown dependence instantly. */
1855 :
static bool
tree_loop_interchange_compute_ddrs (vec<loop_p> loop_nest,
				    vec<data_reference_p> datarefs,
				    vec<ddr_p> *ddrs)
{
  struct data_reference *a, *b;
  class loop *innermost = loop_nest.last ();

  /* Consider each pair of refs where at least one is a write; read/read
     pairs can never carry an interchange-relevant dependence.  */
  for (unsigned i = 0; datarefs.iterate (i, &a); ++i)
    {
      /* "Outer" means the ref's stmt lives outside the innermost loop.  */
      bool a_outer_p = gimple_bb (DR_STMT (a))->loop_father != innermost;
      for (unsigned j = i + 1; datarefs.iterate (j, &b); ++j)
	if (DR_IS_WRITE (a) || DR_IS_WRITE (b))
	  {
	    bool b_outer_p = gimple_bb (DR_STMT (b))->loop_father != innermost;
	    /* Don't support multiple write references in outer loop.  */
	    if (a_outer_p && b_outer_p && DR_IS_WRITE (a) && DR_IS_WRITE (b))
	      return false;

	    /* The ddr is pushed before analysis so the caller can free it
	       via DDRS even on the early-return paths below.  */
	    ddr_p ddr = initialize_data_dependence_relation (a, b, loop_nest);
	    ddrs->safe_push (ddr);
	    compute_affine_dependence (ddr, loop_nest[0]);

	    /* Give up if ddr is unknown dependence or classic direct vector
	       is not available.  */
	    if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
		|| (DDR_ARE_DEPENDENT (ddr) == NULL_TREE
		    && DDR_NUM_DIR_VECTS (ddr) == 0))
	      return false;

	    /* If either data references is in outer loop of nest, we require
	       no dependence here because the data reference need to be moved
	       into inner loop during interchange.  Identical outer refs
	       (the reduction init/fini pair) are tolerated.  */
	    if (a_outer_p && b_outer_p
		&& operand_equal_p (DR_REF (a), DR_REF (b), 0))
	      continue;
	    if (DDR_ARE_DEPENDENT (ddr) != chrec_known
		&& (a_outer_p || b_outer_p))
	      return false;
	  }
    }

  return true;
}
1900 :
1901 : /* Prune DATAREFS by removing any data reference not inside of LOOP. */
1902 :
1903 : static inline void
1904 7 : prune_datarefs_not_in_loop (class loop *loop, vec<data_reference_p> datarefs)
1905 : {
1906 7 : unsigned i, j;
1907 7 : struct data_reference *dr;
1908 :
1909 44 : for (i = 0, j = 0; datarefs.iterate (i, &dr); ++i)
1910 : {
1911 37 : if (flow_bb_inside_loop_p (loop, gimple_bb (DR_STMT (dr))))
1912 35 : datarefs[j++] = dr;
1913 : else
1914 : {
1915 2 : if (dr->aux)
1916 : {
1917 2 : DR_ACCESS_STRIDE (dr)->release ();
1918 2 : delete (vec<tree> *) dr->aux;
1919 : }
1920 2 : free_data_ref (dr);
1921 : }
1922 : }
1923 7 : datarefs.truncate (j);
1924 7 : }
1925 :
1926 : /* Find and store data references in DATAREFS for LOOP nest. If there's
1927 : difficult data reference in a basic block, we shrink the loop nest to
1928 : inner loop of that basic block's father loop. On success, return the
1929 : outer loop of the result loop nest. */
1930 :
static class loop *
prepare_data_references (class loop *loop, vec<data_reference_p> *datarefs)
{
  class loop *loop_nest = loop;
  vec<data_reference_p> *bb_refs;
  basic_block bb, *bbs = get_loop_body_in_dom_order (loop);

  /* bb->aux temporarily holds the per-block ref vector; clear it first.  */
  for (unsigned i = 0; i < loop->num_nodes; i++)
    bbs[i]->aux = NULL;

  /* Find data references for all basic blocks.  Shrink loop nest on difficult
     data reference: restart the nest at the inner loop of the offending
     block's father, or give up (NULL) if that leaves no nest of depth >= 2.  */
  for (unsigned i = 0; loop_nest && i < loop->num_nodes; ++i)
    {
      bb = bbs[i];
      if (!flow_bb_inside_loop_p (loop_nest, bb))
	continue;

      bb_refs = new vec<data_reference_p> ();
      if (find_data_references_in_bb (loop, bb, bb_refs) == chrec_dont_know)
	{
	  loop_nest = bb->loop_father->inner;
	  if (loop_nest && !loop_nest->inner)
	    loop_nest = NULL;

	  /* Partial refs found before the failure must still be freed.  */
	  free_data_refs (*bb_refs);
	  delete bb_refs;
	}
      else if (bb_refs->is_empty ())
	{
	  bb_refs->release ();
	  delete bb_refs;
	}
      else
	bb->aux = bb_refs;
    }

  /* Collect all data references in loop nest.  Blocks that ended up outside
     the (possibly shrunk) nest have their refs freed instead.  Always clear
     bb->aux again.  */
  for (unsigned i = 0; i < loop->num_nodes; i++)
    {
      bb = bbs[i];
      if (!bb->aux)
	continue;

      bb_refs = (vec<data_reference_p> *) bb->aux;
      if (loop_nest && flow_bb_inside_loop_p (loop_nest, bb))
	{
	  datarefs->safe_splice (*bb_refs);
	  bb_refs->release ();
	}
      else
	free_data_refs (*bb_refs);

      delete bb_refs;
      bb->aux = NULL;
    }
  free (bbs);

  return loop_nest;
}
1991 :
1992 : /* Given innermost LOOP, return true if perfect loop nest can be found and
1993 : data dependences can be computed. If succeed, record the perfect loop
1994 : nest in LOOP_NEST; record all data references in DATAREFS and record all
1995 : data dependence relations in DDRS.
1996 :
1997 : We do support a restricted form of imperfect loop nest, i.e, loop nest
1998 : with load/store in outer loop initializing/finalizing simple reduction
1999 : of the innermost loop. For such outer loop reference, we require that
     it has no dependence with others since it will be moved to inner loop
2001 : in interchange. */
2002 :
static bool
prepare_perfect_loop_nest (class loop *loop, vec<loop_p> *loop_nest,
			   vec<data_reference_p> *datarefs, vec<ddr_p> *ddrs)
{
  class loop *start_loop = NULL, *innermost = loop;
  class loop *outermost = loops_for_fn (cfun)->tree_root;

  /* Find loop nest from the innermost loop outward.  OUTERMOST starts at
     the loop tree root and is narrowed by proper_loop_form_for_interchange
     to the innermost outer loop that doesn't form a rectangle nest; the
     walk stops once we would leave it.  */
  while (loop->num != 0 && loop->inner == start_loop
	 && flow_loop_nested_p (outermost, loop))
    {
      if (!proper_loop_form_for_interchange (loop, &outermost))
	break;

      start_loop = loop;
      /* If this loop has sibling loop, the father loop won't be in perfect
	 loop nest.  */
      if (loop->next != NULL)
	break;

      loop = loop_outer (loop);
    }
  /* Need a nest of at least two loops.  */
  if (!start_loop || !start_loop->inner)
    return false;

  /* Prepare the data reference vector for the loop nest, pruning outer
     loops we cannot handle.  */
  start_loop = prepare_data_references (start_loop, datarefs);
  if (!start_loop
      /* Check if there is no data reference.  */
      || datarefs->is_empty ()
      /* Check if there are too many of data references.  */
      || (int) datarefs->length () > MAX_DATAREFS)
    return false;

  /* Compute access strides for all data references, pruning outer
     loops we cannot analyze refs in.  */
  start_loop = compute_access_strides (start_loop, innermost, *datarefs);
  if (!start_loop)
    return false;

  /* Check if any interchange is profitable in the loop nest.  */
  if (!should_interchange_loop_nest (start_loop, innermost, *datarefs))
    return false;

  /* Check if data dependences can be computed for loop nest starting from
     start_loop; on failure, retry with successively smaller nests.  */
  loop = start_loop;
  do {
    loop_nest->truncate (0);

    /* After shrinking, drop refs that fell outside the new nest.  */
    if (loop != start_loop)
      prune_datarefs_not_in_loop (start_loop, *datarefs);

    if (find_loop_nest (start_loop, loop_nest)
	&& tree_loop_interchange_compute_ddrs (*loop_nest, *datarefs, ddrs))
      {
	if (dump_file && (dump_flags & TDF_DETAILS))
	  fprintf (dump_file,
		   "\nConsider loop interchange for loop_nest<%d - %d>\n",
		   start_loop->num, innermost->num);

	if (loop != start_loop)
	  prune_access_strides_not_in_loop (start_loop, innermost, *datarefs);

	if (dump_file && (dump_flags & TDF_DETAILS))
	  dump_access_strides (*datarefs);

	return true;
      }

    /* DDRS may contain partially-computed relations; free them before
       retrying.  */
    free_dependence_relations (*ddrs);
    *ddrs = vNULL;
    /* Try to compute data dependences with the outermost loop stripped.  */
    loop = start_loop;
    start_loop = start_loop->inner;
  } while (start_loop && start_loop->inner);

  return false;
}
2084 :
2085 : /* Main entry for loop interchange pass. */
2086 :
unsigned int
pass_linterchange::execute (function *fun)
{
  /* The loop tree includes the root; <= 2 means at most one real loop,
     so there is no nest to interchange.  */
  if (number_of_loops (fun) <= 2)
    return 0;

  /* Try each innermost loop as the seed of a perfect nest.  */
  bool changed_p = false;
  for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
    {
      vec<loop_p> loop_nest = vNULL;
      vec<data_reference_p> datarefs = vNULL;
      vec<ddr_p> ddrs = vNULL;
      if (prepare_perfect_loop_nest (loop, &loop_nest, &datarefs, &ddrs))
	{
	  tree_loop_interchange loop_interchange (loop_nest);
	  changed_p |= loop_interchange.interchange (datarefs, ddrs);
	}
      /* Vectors are filled even on failed preparation; always release.  */
      free_dependence_relations (ddrs);
      free_data_refs_with_aux (datarefs);
      loop_nest.release ();
    }

  if (changed_p)
    {
      /* Interchange moved memory references across loops; virtual SSA form
	 must be rebuilt, and cached SCEV info is stale.  */
      unsigned todo = TODO_update_ssa_only_virtuals;
      todo |= loop_invariant_motion_in_fun (cfun, false);
      scev_reset ();
      return todo;
    }
  return 0;
}
2118 :
2119 : } // anon namespace
2120 :
/* Create an instance of the loop interchange pass for context CTXT.  */

gimple_opt_pass *
make_pass_linterchange (gcc::context *ctxt)
{
  return new pass_linterchange (ctxt);
}
|