Line data Source code
1 : /* Loop distribution.
2 : Copyright (C) 2006-2026 Free Software Foundation, Inc.
3 : Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 : and Sebastian Pop <sebastian.pop@amd.com>.
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it
9 : under the terms of the GNU General Public License as published by the
10 : Free Software Foundation; either version 3, or (at your option) any
11 : later version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT
14 : ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : /* This pass performs loop distribution: for example, the loop
23 :
24 : |DO I = 2, N
25 : | A(I) = B(I) + C
26 : | D(I) = A(I-1)*E
27 : |ENDDO
28 :
29 : is transformed to
30 :
31 : |DOALL I = 2, N
32 : | A(I) = B(I) + C
33 : |ENDDO
34 : |
35 : |DOALL I = 2, N
36 : | D(I) = A(I-1)*E
37 : |ENDDO
38 :
39 : Loop distribution is the dual of loop fusion. It separates statements
40 : of a loop (or loop nest) into multiple loops (or loop nests) with the
41 : same loop header. The major goal is to separate statements which may
42 : be vectorized from those that can't. This pass implements distribution
43 : in the following steps:
44 :
45 : 1) Seed partitions with specific type statements. For now we support
46 : two types seed statements: statement defining variable used outside
47 : of loop; statement storing to memory.
48 : 2) Build reduced dependence graph (RDG) for loop to be distributed.
49 : The vertices (RDG:V) model all statements in the loop and the edges
50 : (RDG:E) model flow and control dependencies between statements.
51 : 3) Apart from RDG, compute data dependencies between memory references.
52 : 4) Starting from seed statement, build up partition by adding depended
53 : statements according to RDG's dependence information. Partition is
54 : classified as parallel type if it can be executed paralleled; or as
55 : sequential type if it can't. Parallel type partition is further
56 : classified as different builtin kinds if it can be implemented as
57 : builtin function calls.
58 : 5) Build partition dependence graph (PG) based on data dependencies.
59 : The vertices (PG:V) model all partitions and the edges (PG:E) model
60 : all data dependencies between every partitions pair. In general,
61 : data dependence is either compilation time known or unknown. In C
62 : family languages, there exists quite amount compilation time unknown
63 : dependencies because of possible alias relation of data references.
64 : We categorize PG's edge to two types: "true" edge that represents
65 : compilation time known data dependencies; "alias" edge for all other
66 : data dependencies.
67 : 6) Traverse subgraph of PG as if all "alias" edges don't exist. Merge
68 : partitions in each strong connected component (SCC) correspondingly.
69 : Build new PG for merged partitions.
70 : 7) Traverse PG again and this time with both "true" and "alias" edges
71 : included. We try to break SCCs by removing some edges. Because
72 : SCCs by "true" edges are all fused in step 6), we can break SCCs
73 : by removing some "alias" edges. It's NP-hard to choose optimal
74 : edge set, fortunately simple approximation is good enough for us
75 : given the small problem scale.
76 : 8) Collect all data dependencies of the removed "alias" edges. Create
77 : runtime alias checks for collected data dependencies.
78 : 9) Version loop under the condition of runtime alias checks. Given
79 : loop distribution generally introduces additional overhead, it is
80 : only useful if vectorization is achieved in distributed loop. We
81 : version loop with internal function call IFN_LOOP_DIST_ALIAS. If
82 : no distributed loop can be vectorized, we simply remove distributed
83 : loops and recover to the original one.
84 :
85 : TODO:
86 : 1) We only distribute innermost two-level loop nest now. We should
87 : extend it for arbitrary loop nests in the future.
88 : 2) We only fuse partitions in SCC now. A better fusion algorithm is
89 : desired to minimize loop overhead, maximize parallelism and maximize
90 : data reuse. */
91 :
92 : #include "config.h"
93 : #include "system.h"
94 : #include "coretypes.h"
95 : #include "backend.h"
96 : #include "tree.h"
97 : #include "gimple.h"
98 : #include "cfghooks.h"
99 : #include "tree-pass.h"
100 : #include "ssa.h"
101 : #include "gimple-pretty-print.h"
102 : #include "fold-const.h"
103 : #include "cfganal.h"
104 : #include "gimple-iterator.h"
105 : #include "gimplify-me.h"
106 : #include "stor-layout.h"
107 : #include "tree-cfg.h"
108 : #include "tree-ssa-loop-manip.h"
109 : #include "tree-ssa-loop-ivopts.h"
110 : #include "tree-ssa-loop.h"
111 : #include "tree-into-ssa.h"
112 : #include "tree-ssa.h"
113 : #include "cfgloop.h"
114 : #include "tree-scalar-evolution.h"
115 : #include "tree-vectorizer.h"
116 : #include "tree-eh.h"
117 : #include "gimple-fold.h"
118 : #include "tree-affine.h"
119 : #include "intl.h"
120 : #include "rtl.h"
121 : #include "memmodel.h"
122 : #include "optabs.h"
123 : #include "tree-ssa-loop-niter.h"
124 :
125 :
126 : #define MAX_DATAREFS_NUM \
127 : ((unsigned) param_loop_max_datarefs_for_datadeps)
128 :
129 : /* Threshold controlling number of distributed partitions. Given it may
130 : be unnecessary if a memory stream cost model is invented in the future,
131 : we define it as a temporary macro, rather than a parameter. */
132 : #define NUM_PARTITION_THRESHOLD (4)
133 :
134 : /* Hashtable helpers. */
135 :
136 : struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
137 : {
138 : static inline hashval_t hash (const data_dependence_relation *);
139 : static inline bool equal (const data_dependence_relation *,
140 : const data_dependence_relation *);
141 : };
142 :
143 : /* Hash function for data dependence. */
144 :
145 : inline hashval_t
146 9093507 : ddr_hasher::hash (const data_dependence_relation *ddr)
147 : {
148 9093507 : inchash::hash h;
149 9093507 : h.add_ptr (DDR_A (ddr));
150 9093507 : h.add_ptr (DDR_B (ddr));
151 9093507 : return h.end ();
152 : }
153 :
154 : /* Hash table equality function for data dependence. */
155 :
156 : inline bool
157 8106427 : ddr_hasher::equal (const data_dependence_relation *ddr1,
158 : const data_dependence_relation *ddr2)
159 : {
160 8106427 : return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
161 : }
162 :
163 :
164 :
165 : #define DR_INDEX(dr) ((uintptr_t) (dr)->aux)
166 :
167 : /* A Reduced Dependence Graph (RDG) vertex representing a statement. */
168 : struct rdg_vertex
169 : {
170 : /* The statement represented by this vertex. */
171 : gimple *stmt;
172 :
173 : /* Vector of data-references in this statement. */
174 : vec<data_reference_p> datarefs;
175 :
176 : /* True when the statement contains a write to memory. */
177 : bool has_mem_write;
178 :
179 : /* True when the statement contains a read from memory. */
180 : bool has_mem_reads;
181 : };
182 :
183 : #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt
184 : #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
185 : #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
186 : #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
187 : #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
188 : #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
189 : #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
190 : #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
191 :
192 : /* Data dependence type. */
193 :
194 : enum rdg_dep_type
195 : {
196 : /* Read After Write (RAW). */
197 : flow_dd = 'f',
198 :
199 : /* Control dependence (execute conditional on). */
200 : control_dd = 'c'
201 : };
202 :
203 : /* Dependence information attached to an edge of the RDG. */
204 :
205 : struct rdg_edge
206 : {
207 : /* Type of the dependence. */
208 : enum rdg_dep_type type;
209 : };
210 :
211 : #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type
212 :
213 : /* Kind of distributed loop. */
214 : enum partition_kind {
215 : PKIND_NORMAL,
216 : /* Partial memset stands for a paritition can be distributed into a loop
217 : of memset calls, rather than a single memset call. It's handled just
218 : like a normal parition, i.e, distributed as separate loop, no memset
219 : call is generated.
220 :
221 : Note: This is a hacking fix trying to distribute ZERO-ing stmt in a
222 : loop nest as deep as possible. As a result, parloop achieves better
223 : parallelization by parallelizing deeper loop nest. This hack should
224 : be unnecessary and removed once distributed memset can be understood
225 : and analyzed in data reference analysis. See PR82604 for more. */
226 : PKIND_PARTIAL_MEMSET,
227 : PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
228 : };
229 :
230 : /* Type of distributed loop. */
231 : enum partition_type {
232 : /* The distributed loop can be executed parallelly. */
233 : PTYPE_PARALLEL = 0,
234 : /* The distributed loop has to be executed sequentially. */
235 : PTYPE_SEQUENTIAL
236 : };
237 :
238 : /* Builtin info for loop distribution. */
239 : struct builtin_info
240 : {
241 : /* data-references a kind != PKIND_NORMAL partition is about. */
242 : data_reference_p dst_dr;
243 : data_reference_p src_dr;
244 : /* Base address and size of memory objects operated by the builtin. Note
245 : both dest and source memory objects must have the same size. */
246 : tree dst_base;
247 : tree src_base;
248 : tree size;
249 : /* Base and offset part of dst_base after stripping constant offset. This
250 : is only used in memset builtin distribution for now. */
251 : tree dst_base_base;
252 : unsigned HOST_WIDE_INT dst_base_offset;
253 : };
254 :
255 : /* Partition for loop distribution. */
256 : struct partition
257 : {
258 : /* Statements of the partition. */
259 : bitmap stmts;
260 : /* True if the partition defines variable which is used outside of loop. */
261 : bool reduction_p;
262 : location_t loc;
263 : enum partition_kind kind;
264 : enum partition_type type;
265 : /* Data references in the partition. */
266 : bitmap datarefs;
267 : /* Information of builtin parition. */
268 : struct builtin_info *builtin;
269 : };
270 :
271 : /* Partitions are fused because of different reasons. */
272 : enum fuse_type
273 : {
274 : FUSE_NON_BUILTIN = 0,
275 : FUSE_REDUCTION = 1,
276 : FUSE_SHARE_REF = 2,
277 : FUSE_SAME_SCC = 3,
278 : FUSE_FINALIZE = 4
279 : };
280 :
281 : /* Description on different fusing reason. */
282 : static const char *fuse_message[] = {
283 : "they are non-builtins",
284 : "they have reductions",
285 : "they have shared memory refs",
286 : "they are in the same dependence scc",
287 : "there is no point to distribute loop"};
288 :
289 :
290 : /* Dump vertex I in RDG to FILE. */
291 :
292 : static void
293 871 : dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
294 : {
295 871 : struct vertex *v = &(rdg->vertices[i]);
296 871 : struct graph_edge *e;
297 :
298 871 : fprintf (file, "(vertex %d: (%s%s) (in:", i,
299 871 : RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
300 871 : RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
301 :
302 871 : if (v->pred)
303 2903 : for (e = v->pred; e; e = e->pred_next)
304 2032 : fprintf (file, " %d", e->src);
305 :
306 871 : fprintf (file, ") (out:");
307 :
308 871 : if (v->succ)
309 2758 : for (e = v->succ; e; e = e->succ_next)
310 2032 : fprintf (file, " %d", e->dest);
311 :
312 871 : fprintf (file, ")\n");
313 871 : print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
314 871 : fprintf (file, ")\n");
315 871 : }
316 :
317 : /* Call dump_rdg_vertex on stderr. */
318 :
319 : DEBUG_FUNCTION void
320 0 : debug_rdg_vertex (struct graph *rdg, int i)
321 : {
322 0 : dump_rdg_vertex (stderr, rdg, i);
323 0 : }
324 :
325 : /* Dump the reduced dependence graph RDG to FILE. */
326 :
327 : static void
328 70 : dump_rdg (FILE *file, struct graph *rdg)
329 : {
330 70 : fprintf (file, "(rdg\n");
331 941 : for (int i = 0; i < rdg->n_vertices; i++)
332 871 : dump_rdg_vertex (file, rdg, i);
333 70 : fprintf (file, ")\n");
334 70 : }
335 :
336 : /* Call dump_rdg on stderr. */
337 :
338 : DEBUG_FUNCTION void
339 0 : debug_rdg (struct graph *rdg)
340 : {
341 0 : dump_rdg (stderr, rdg);
342 0 : }
343 :
344 : static void
345 0 : dot_rdg_1 (FILE *file, struct graph *rdg)
346 : {
347 0 : int i;
348 0 : pretty_printer pp;
349 0 : pp_needs_newline (&pp) = false;
350 0 : pp.set_output_stream (file);
351 :
352 0 : fprintf (file, "digraph RDG {\n");
353 :
354 0 : for (i = 0; i < rdg->n_vertices; i++)
355 : {
356 0 : struct vertex *v = &(rdg->vertices[i]);
357 0 : struct graph_edge *e;
358 :
359 0 : fprintf (file, "%d [label=\"[%d] ", i, i);
360 0 : pp_gimple_stmt_1 (&pp, RDGV_STMT (v), 0, TDF_SLIM);
361 0 : pp_flush (&pp);
362 0 : fprintf (file, "\"]\n");
363 :
364 : /* Highlight reads from memory. */
365 0 : if (RDG_MEM_READS_STMT (rdg, i))
366 0 : fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
367 :
368 : /* Highlight stores to memory. */
369 0 : if (RDG_MEM_WRITE_STMT (rdg, i))
370 0 : fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
371 :
372 0 : if (v->succ)
373 0 : for (e = v->succ; e; e = e->succ_next)
374 0 : switch (RDGE_TYPE (e))
375 : {
376 0 : case flow_dd:
377 : /* These are the most common dependences: don't print these. */
378 0 : fprintf (file, "%d -> %d \n", i, e->dest);
379 0 : break;
380 :
381 0 : case control_dd:
382 0 : fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
383 0 : break;
384 :
385 0 : default:
386 0 : gcc_unreachable ();
387 : }
388 : }
389 :
390 0 : fprintf (file, "}\n\n");
391 0 : }
392 :
393 : /* Display the Reduced Dependence Graph using dotty. */
394 :
395 : DEBUG_FUNCTION void
396 0 : dot_rdg (struct graph *rdg)
397 : {
398 : /* When debugging, you may want to enable the following code. */
399 : #ifdef HAVE_POPEN
400 0 : FILE *file = popen ("dot -Tx11", "w");
401 0 : if (!file)
402 : return;
403 0 : dot_rdg_1 (file, rdg);
404 0 : fflush (file);
405 0 : close (fileno (file));
406 0 : pclose (file);
407 : #else
408 : dot_rdg_1 (stderr, rdg);
409 : #endif
410 : }
411 :
412 : /* Returns the index of STMT in RDG. */
413 :
414 : static int
415 13251524 : rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
416 : {
417 13251524 : int index = gimple_uid (stmt);
418 13251524 : gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
419 13251524 : return index;
420 : }
421 :
422 : /* Creates dependence edges in RDG for all the uses of DEF. IDEF is
423 : the index of DEF in RDG. */
424 :
425 : static void
426 1782287 : create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
427 : {
428 1782287 : use_operand_p imm_use_p;
429 1782287 : imm_use_iterator iterator;
430 :
431 6627734 : FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
432 : {
433 3063160 : struct graph_edge *e;
434 3063160 : int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
435 :
436 3063160 : if (use < 0)
437 443333 : continue;
438 :
439 2619827 : e = add_edge (rdg, idef, use);
440 2619827 : e->data = XNEW (struct rdg_edge);
441 2619827 : RDGE_TYPE (e) = flow_dd;
442 1782287 : }
443 1782287 : }
444 :
445 : /* Creates an edge for the control dependences of BB to the vertex V. */
446 :
447 : static void
448 2272759 : create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
449 : int v, control_dependences *cd)
450 : {
451 2272759 : bitmap_iterator bi;
452 2272759 : unsigned edge_n;
453 6467156 : EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
454 : 0, edge_n, bi)
455 : {
456 4194397 : basic_block cond_bb = cd->get_edge_src (edge_n);
457 4194397 : gimple *stmt = *gsi_last_bb (cond_bb);
458 4194397 : if (stmt && is_ctrl_stmt (stmt))
459 : {
460 3761369 : struct graph_edge *e;
461 3761369 : int c = rdg_vertex_for_stmt (rdg, stmt);
462 3761369 : if (c < 0)
463 1233575 : continue;
464 :
465 2527794 : e = add_edge (rdg, c, v);
466 2527794 : e->data = XNEW (struct rdg_edge);
467 2527794 : RDGE_TYPE (e) = control_dd;
468 : }
469 : }
470 2272759 : }
471 :
472 : /* Creates the edges of the reduced dependence graph RDG. */
473 :
474 : static void
475 144485 : create_rdg_flow_edges (struct graph *rdg)
476 : {
477 144485 : int i;
478 144485 : def_operand_p def_p;
479 144485 : ssa_op_iter iter;
480 :
481 2348830 : for (i = 0; i < rdg->n_vertices; i++)
482 6190977 : FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
483 : iter, SSA_OP_DEF)
484 1782287 : create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
485 144485 : }
486 :
487 : /* Creates the edges of the reduced dependence graph RDG. */
488 :
489 : static void
490 137031 : create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
491 : {
492 137031 : int i;
493 :
494 2291696 : for (i = 0; i < rdg->n_vertices; i++)
495 : {
496 2154665 : gimple *stmt = RDG_STMT (rdg, i);
497 2154665 : if (gimple_code (stmt) == GIMPLE_PHI)
498 : {
499 421241 : edge_iterator ei;
500 421241 : edge e;
501 1262044 : FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
502 840803 : if (flow_bb_inside_loop_p (loop, e->src))
503 539335 : create_edge_for_control_dependence (rdg, e->src, i, cd);
504 : }
505 : else
506 1733424 : create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
507 : }
508 137031 : }
509 :
510 :
511 : class loop_distribution
512 : {
513 : private:
514 : /* The loop (nest) to be distributed. */
515 : vec<loop_p> loop_nest;
516 :
517 : /* Vector of data references in the loop to be distributed. */
518 : vec<data_reference_p> datarefs_vec;
519 :
520 : /* If there is nonaddressable data reference in above vector. */
521 : bool has_nonaddressable_dataref_p;
522 :
523 : /* Store index of data reference in aux field. */
524 :
525 : /* Hash table for data dependence relation in the loop to be distributed. */
526 : hash_table<ddr_hasher> *ddrs_table;
527 :
528 : /* Array mapping basic block's index to its topological order. */
529 : int *bb_top_order_index;
530 : /* And size of the array. */
531 : int bb_top_order_index_size;
532 :
533 : /* Build the vertices of the reduced dependence graph RDG. Return false
534 : if that failed. */
535 : bool create_rdg_vertices (struct graph *rdg, const vec<gimple *> &stmts,
536 : loop_p loop);
537 :
538 : /* Initialize STMTS with all the statements of LOOP. We use topological
539 : order to discover all statements. The order is important because
540 : generate_loops_for_partition is using the same traversal for identifying
541 : statements in loop copies. */
542 : void stmts_from_loop (class loop *loop, vec<gimple *> *stmts);
543 :
544 :
545 : /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
546 : LOOP, and one edge per flow dependence or control dependence from control
547 : dependence CD. During visiting each statement, data references are also
548 : collected and recorded in global data DATAREFS_VEC. */
549 : struct graph * build_rdg (class loop *loop, control_dependences *cd);
550 :
551 : /* Merge PARTITION into the partition DEST. RDG is the reduced dependence
552 : graph and we update type for result partition if it is non-NULL. */
553 : void partition_merge_into (struct graph *rdg,
554 : partition *dest, partition *partition,
555 : enum fuse_type ft);
556 :
557 :
558 : /* Return data dependence relation for data references A and B. The two
559 : data references must be in lexicographic order wrto reduced dependence
560 : graph RDG. We firstly try to find ddr from global ddr hash table. If
561 : it doesn't exist, compute the ddr and cache it. */
562 : data_dependence_relation * get_data_dependence (struct graph *rdg,
563 : data_reference_p a,
564 : data_reference_p b);
565 :
566 :
567 : /* In reduced dependence graph RDG for loop distribution, return true if
568 : dependence between references DR1 and DR2 leads to a dependence cycle
569 : and such dependence cycle can't be resolved by runtime alias check. */
570 : bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1,
571 : data_reference_p dr2);
572 :
573 :
574 : /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
575 : PARTITION1's type after merging PARTITION2 into PARTITION1. */
576 : void update_type_for_merge (struct graph *rdg,
577 : partition *partition1, partition *partition2);
578 :
579 :
580 : /* Returns a partition with all the statements needed for computing
581 : the vertex V of the RDG, also including the loop exit conditions. */
582 : partition *build_rdg_partition_for_vertex (struct graph *rdg, int v);
583 :
584 : /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
585 : if it forms builtin memcpy or memmove call. */
586 : void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
587 : data_reference_p dst_dr, data_reference_p src_dr);
588 :
589 : /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
590 : For the moment we detect memset, memcpy and memmove patterns. Bitmap
591 : STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.
592 : Returns true if there is a reduction in all partitions and we
593 : possibly did not mark PARTITION as having one for this reason. */
594 :
595 : bool
596 : classify_partition (loop_p loop,
597 : struct graph *rdg, partition *partition,
598 : bitmap stmt_in_all_partitions);
599 :
600 :
601 : /* Returns true when PARTITION1 and PARTITION2 access the same memory
602 : object in RDG. */
603 : bool share_memory_accesses (struct graph *rdg,
604 : partition *partition1, partition *partition2);
605 :
606 : /* For each seed statement in STARTING_STMTS, this function builds
607 : partition for it by adding depended statements according to RDG.
608 : All partitions are recorded in PARTITIONS. */
609 : void rdg_build_partitions (struct graph *rdg,
610 : vec<gimple *> starting_stmts,
611 : vec<partition *> *partitions);
612 :
613 : /* Compute partition dependence created by the data references in DRS1
614 : and DRS2, modify and return DIR according to that. IF ALIAS_DDR is
615 : not NULL, we record dependence introduced by possible alias between
616 : two data references in ALIAS_DDRS; otherwise, we simply ignore such
617 : dependence as if it doesn't exist at all. */
618 : int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1,
619 : bitmap drs2, vec<ddr_p> *alias_ddrs);
620 :
621 :
622 : /* Build and return partition dependence graph for PARTITIONS. RDG is
623 : reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P
624 : is true, data dependence caused by possible alias between references
625 : is ignored, as if it doesn't exist at all; otherwise all depdendences
626 : are considered. */
627 : struct graph *build_partition_graph (struct graph *rdg,
628 : vec<struct partition *> *partitions,
629 : bool ignore_alias_p);
630 :
631 : /* Given reduced dependence graph RDG merge strong connected components
632 : of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused by
633 : possible alias between references is ignored, as if it doesn't exist
634 : at all; otherwise all depdendences are considered. */
635 : void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *>
636 : *partitions, bool ignore_alias_p);
637 :
638 : /* This is the main function breaking strong conected components in
639 : PARTITIONS giving reduced depdendence graph RDG. Store data dependence
640 : relations for runtime alias check in ALIAS_DDRS. */
641 : void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *>
642 : *partitions, vec<ddr_p> *alias_ddrs);
643 :
644 :
645 : /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
646 : ALIAS_DDRS contains ddrs which need runtime alias check. */
647 : void finalize_partitions (class loop *loop, vec<struct partition *>
648 : *partitions, vec<ddr_p> *alias_ddrs);
649 :
650 : /* Distributes the code from LOOP in such a way that producer statements
651 : are placed before consumer statements. Tries to separate only the
652 : statements from STMTS into separate loops. Returns the number of
653 : distributed loops. Set NB_CALLS to number of generated builtin calls.
654 : Set *DESTROY_P to whether LOOP needs to be destroyed. */
655 : int distribute_loop (class loop *loop, const vec<gimple *> &stmts,
656 : control_dependences *cd, int *nb_calls, bool *destroy_p,
657 : bool only_patterns_p);
658 :
659 : /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
660 : replace them accordingly. */
661 : bool transform_reduction_loop (loop_p loop);
662 :
663 : /* Compute topological order for basic blocks. Topological order is
664 : needed because data dependence is computed for data references in
665 : lexicographical order. */
666 : void bb_top_order_init (void);
667 :
668 : void bb_top_order_destroy (void);
669 :
670 : public:
671 :
672 : /* Getter for bb_top_order. */
673 :
674 2975555 : inline int get_bb_top_order_index_size (void)
675 : {
676 2975555 : return bb_top_order_index_size;
677 : }
678 :
679 5951110 : inline int get_bb_top_order_index (int i)
680 : {
681 5951110 : return bb_top_order_index[i];
682 : }
683 :
684 : unsigned int execute (function *fun);
685 : };
686 :
687 :
688 : /* If X has a smaller topological sort number than Y, returns -1;
689 : if greater, returns 1. */
690 : static int
691 2975555 : bb_top_order_cmp_r (const void *x, const void *y, void *loop)
692 : {
693 2975555 : loop_distribution *_loop =
694 : (loop_distribution *) loop;
695 :
696 2975555 : basic_block bb1 = *(const basic_block *) x;
697 2975555 : basic_block bb2 = *(const basic_block *) y;
698 :
699 2975555 : int bb_top_order_index_size = _loop->get_bb_top_order_index_size ();
700 :
701 2975555 : gcc_assert (bb1->index < bb_top_order_index_size
702 : && bb2->index < bb_top_order_index_size);
703 2975555 : gcc_assert (bb1 == bb2
704 : || _loop->get_bb_top_order_index(bb1->index)
705 : != _loop->get_bb_top_order_index(bb2->index));
706 :
707 2975555 : return (_loop->get_bb_top_order_index(bb1->index) -
708 2975555 : _loop->get_bb_top_order_index(bb2->index));
709 : }
710 :
711 : bool
712 147701 : loop_distribution::create_rdg_vertices (struct graph *rdg,
713 : const vec<gimple *> &stmts,
714 : loop_p loop)
715 : {
716 147701 : int i;
717 147701 : gimple *stmt;
718 :
719 2374093 : FOR_EACH_VEC_ELT (stmts, i, stmt)
720 : {
721 2229608 : struct vertex *v = &(rdg->vertices[i]);
722 :
723 : /* Record statement to vertex mapping. */
724 2229608 : gimple_set_uid (stmt, i);
725 :
726 2229608 : v->data = XNEW (struct rdg_vertex);
727 2229608 : RDGV_STMT (v) = stmt;
728 2229608 : RDGV_DATAREFS (v).create (0);
729 2229608 : RDGV_HAS_MEM_WRITE (v) = false;
730 2229608 : RDGV_HAS_MEM_READS (v) = false;
731 2229608 : if (gimple_code (stmt) == GIMPLE_PHI)
732 444656 : continue;
733 :
734 1784952 : unsigned drp = datarefs_vec.length ();
735 1784952 : if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
736 : return false;
737 2671776 : for (unsigned j = drp; j < datarefs_vec.length (); ++j)
738 : {
739 445384 : data_reference_p dr = datarefs_vec[j];
740 445384 : if (DR_IS_READ (dr))
741 239348 : RDGV_HAS_MEM_READS (v) = true;
742 : else
743 206036 : RDGV_HAS_MEM_WRITE (v) = true;
744 445384 : RDGV_DATAREFS (v).safe_push (dr);
745 445384 : has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
746 : }
747 : }
748 : return true;
749 : }
750 :
751 : void
752 147701 : loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts)
753 : {
754 147701 : unsigned int i;
755 147701 : basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r);
756 :
757 629700 : for (i = 0; i < loop->num_nodes; i++)
758 : {
759 481999 : basic_block bb = bbs[i];
760 :
761 1072631 : for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
762 590632 : gsi_next (&bsi))
763 1181264 : if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
764 445894 : stmts->safe_push (bsi.phi ());
765 :
766 3801503 : for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
767 2837505 : gsi_next (&bsi))
768 : {
769 2837505 : gimple *stmt = gsi_stmt (bsi);
770 2837505 : if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
771 1811570 : stmts->safe_push (stmt);
772 : }
773 : }
774 :
775 147701 : free (bbs);
776 147701 : }
777 :
778 : /* Free the reduced dependence graph RDG. */
779 :
780 : static void
781 147701 : free_rdg (struct graph *rdg, loop_p loop)
782 : {
783 147701 : int i;
784 :
785 2405165 : for (i = 0; i < rdg->n_vertices; i++)
786 : {
787 2257464 : struct vertex *v = &(rdg->vertices[i]);
788 2257464 : struct graph_edge *e;
789 :
790 7405085 : for (e = v->succ; e; e = e->succ_next)
791 5147621 : free (e->data);
792 :
793 2257464 : if (v->data)
794 : {
795 2229608 : (RDGV_DATAREFS (v)).release ();
796 2229608 : free (v->data);
797 : }
798 : }
799 :
800 147701 : free_graph (rdg);
801 :
802 : /* Reset UIDs of stmts still in the loop. */
803 147701 : basic_block *bbs = get_loop_body (loop);
804 629700 : for (unsigned i = 0; i < loop->num_nodes; ++i)
805 : {
806 481999 : basic_block bb = bbs[i];
807 481999 : gimple_stmt_iterator gsi;
808 1072571 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
809 590572 : gimple_set_uid (gsi_stmt (gsi), -1);
810 3798427 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
811 2834429 : gimple_set_uid (gsi_stmt (gsi), -1);
812 : }
813 147701 : free (bbs);
814 147701 : }
815 :
816 : struct graph *
817 147701 : loop_distribution::build_rdg (class loop *loop, control_dependences *cd)
818 : {
819 147701 : struct graph *rdg;
820 :
821 : /* Create the RDG vertices from the stmts of the loop nest. */
822 147701 : auto_vec<gimple *, 10> stmts;
823 147701 : stmts_from_loop (loop, &stmts);
824 295402 : rdg = new_graph (stmts.length ());
825 147701 : if (!create_rdg_vertices (rdg, stmts, loop))
826 : {
827 3216 : free_rdg (rdg, loop);
828 3216 : return NULL;
829 : }
830 144485 : stmts.release ();
831 :
832 144485 : create_rdg_flow_edges (rdg);
833 144485 : if (cd)
834 137031 : create_rdg_cd_edges (rdg, cd, loop);
835 :
836 : return rdg;
837 147701 : }
838 :
839 :
840 : /* Allocate and initialize a partition from BITMAP. */
841 :
842 : static partition *
843 246256 : partition_alloc (void)
844 : {
845 246256 : partition *partition = XCNEW (struct partition);
846 246256 : partition->stmts = BITMAP_ALLOC (NULL);
847 246256 : partition->reduction_p = false;
848 246256 : partition->loc = UNKNOWN_LOCATION;
849 246256 : partition->kind = PKIND_NORMAL;
850 246256 : partition->type = PTYPE_PARALLEL;
851 246256 : partition->datarefs = BITMAP_ALLOC (NULL);
852 246256 : return partition;
853 : }
854 :
855 : /* Free PARTITION. */
856 :
857 : static void
858 246256 : partition_free (partition *partition)
859 : {
860 246256 : BITMAP_FREE (partition->stmts);
861 246256 : BITMAP_FREE (partition->datarefs);
862 246256 : if (partition->builtin)
863 12833 : free (partition->builtin);
864 :
865 246256 : free (partition);
866 246256 : }
867 :
868 : /* Returns true if the partition can be generated as a builtin. */
869 :
870 : static bool
871 351160 : partition_builtin_p (partition *partition)
872 : {
873 351160 : return partition->kind > PKIND_PARTIAL_MEMSET;
874 : }
875 :
876 : /* Returns true if the partition contains a reduction. */
877 :
878 : static bool
879 2060160 : partition_reduction_p (partition *partition)
880 : {
881 2060160 : return partition->reduction_p;
882 : }
883 :
/* Merge PARTITION into DEST, noting FT as the reason when dumping.
   The merged partition loses any recognized builtin kind, inherits
   PARTITION's statements, data references and reduction flag, and
   stays parallel only if no new dependence cycle appears.  */

void
loop_distribution::partition_merge_into (struct graph *rdg,
		partition *dest, partition *partition, enum fuse_type ft)
{
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
      fprintf (dump_file, "  Part 1: ");
      dump_bitmap (dump_file, dest->stmts);
      fprintf (dump_file, "  Part 2: ");
      dump_bitmap (dump_file, partition->stmts);
    }

  /* Fusing invalidates any memset/memcpy classification of DEST.  */
  dest->kind = PKIND_NORMAL;
  /* A sequential input forces the result to be sequential as well.  */
  if (dest->type == PTYPE_PARALLEL)
    dest->type = partition->type;

  bitmap_ior_into (dest->stmts, partition->stmts);
  if (partition_reduction_p (partition))
    dest->reduction_p = true;

  /* Further check if any data dependence prevents us from executing the
     new partition parallelly.  */
  if (dest->type == PTYPE_PARALLEL && rdg != NULL)
    update_type_for_merge (rdg, dest, partition);

  /* Union the data references last: update_type_for_merge above relies
     on DEST and PARTITION still having disjoint dataref sets.  */
  bitmap_ior_into (dest->datarefs, partition->datarefs);
}
912 :
913 :
914 : /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
915 : the LOOP. */
916 :
static bool
ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
{
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  /* Walk all immediate uses of DEF.  */
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    {
      /* Debug binds are not real uses and must not influence the
	 partitioning decision.  */
      if (is_gimple_debug (USE_STMT (use_p)))
	continue;

      basic_block use_bb = gimple_bb (USE_STMT (use_p));
      if (!flow_bb_inside_loop_p (loop, use_bb))
	return true;
    }

  return false;
}
935 :
936 : /* Returns true when STMT defines a scalar variable used after the
937 : loop LOOP. */
938 :
static bool
stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
{
  def_operand_p def_p;
  ssa_op_iter op_iter;

  /* A PHI node has exactly one result; check it directly.  */
  if (gimple_code (stmt) == GIMPLE_PHI)
    return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);

  /* Otherwise check every SSA name STMT defines.  */
  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
    if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
      return true;

  return false;
}
954 :
955 : /* Return a copy of LOOP placed before LOOP. */
956 :
static class loop *
copy_loop_before (class loop *loop, bool redirect_lc_phi_defs)
{
  class loop *res;
  edge preheader = loop_preheader_edge (loop);

  /* The original/copy tables provide the old-name -> new-name mapping
     (queried via get_current_def below) for the duplicated body.  */
  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, single_exit (loop), NULL,
						NULL, preheader, NULL, false);
  gcc_assert (res != NULL);

  /* When a not last partition is supposed to keep the LC PHIs computed
     adjust their definitions.  */
  if (redirect_lc_phi_defs)
    {
      edge exit = single_exit (loop);
      for (gphi_iterator si = gsi_start_phis (exit->dest); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  /* Virtual operands are handled by the SSA update below.  */
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;
	  use_operand_p use_p = PHI_ARG_DEF_PTR_FROM_EDGE (phi, exit);
	  if (TREE_CODE (USE_FROM_PTR (use_p)) == SSA_NAME)
	    {
	      /* Map the name to its copy in the duplicated loop.  */
	      tree new_def = get_current_def (USE_FROM_PTR (use_p));
	      if (!new_def)
		/* Something defined outside of the loop.  */
		continue;
	      SET_USE (use_p, new_def);
	    }
	}
    }

  free_original_copy_tables ();
  delete_update_ssa ();

  return res;
}
996 :
997 : /* Creates an empty basic block after LOOP. */
998 :
999 : static void
1000 1135 : create_bb_after_loop (class loop *loop)
1001 : {
1002 1135 : edge exit = single_exit (loop);
1003 :
1004 1135 : if (!exit)
1005 : return;
1006 :
1007 1135 : split_edge (exit);
1008 : }
1009 :
1010 : /* Generate code for PARTITION from the code in LOOP. The loop is
1011 : copied when COPY_P is true. All the statements not flagged in the
1012 : PARTITION bitmap are removed from the loop or from its copy. The
1013 : statements are indexed in sequence inside a basic block, and the
1014 : basic blocks of a loop are taken in dom order. */
1015 :
static void
generate_loops_for_partition (class loop *loop, partition *partition,
			      bool copy_p, bool keep_lc_phis_p)
{
  unsigned i;
  basic_block *bbs;

  if (copy_p)
    {
      /* Work on a copy placed before LOOP, preserving the original
	 loop number mapping across the duplication.  */
      int orig_loop_num = loop->orig_loop_num;
      loop = copy_loop_before (loop, keep_lc_phis_p);
      gcc_assert (loop != NULL);
      loop->orig_loop_num = orig_loop_num;
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }
  else
    {
      /* Origin number is set to the new versioned loop's num.  */
      gcc_assert (loop->orig_loop_num != loop->num);
    }

  /* Remove stmts not in the PARTITION bitmap.  */
  bbs = get_loop_body_in_dom_order (loop);

  /* First pass: reset debug uses of statements that are about to be
     removed, so their removal leaves no dangling debug binds.  */
  if (MAY_HAVE_DEBUG_BIND_STMTS)
    for (i = 0; i < loop->num_nodes; i++)
      {
	basic_block bb = bbs[i];

	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gphi *phi = bsi.phi ();
	    if (!virtual_operand_p (gimple_phi_result (phi))
		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	      reset_debug_uses (phi);
	  }

	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
	  {
	    gimple *stmt = gsi_stmt (bsi);
	    if (gimple_code (stmt) != GIMPLE_LABEL
		&& !is_gimple_debug (stmt)
		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	      reset_debug_uses (stmt);
	  }
      }

  /* Second pass: delete or neutralize everything outside PARTITION.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
      edge inner_exit = NULL;

      if (loop != bb->loop_father)
	inner_exit = single_exit (bb->loop_father);

      /* Drop non-virtual PHIs whose UID is not in the partition.  */
      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
	{
	  gphi *phi = bsi.phi ();
	  if (!virtual_operand_p (gimple_phi_result (phi))
	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	    remove_phi_node (&bsi, true);
	  else
	    gsi_next (&bsi);
	}

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL
	      && !is_gimple_debug (stmt)
	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	    {
	      /* In distribution of loop nest, if bb is inner loop's exit_bb,
		 we choose its exit edge/path in order to avoid generating
		 infinite loop.  For all other cases, we choose an arbitrary
		 path through the empty CFG part that this unnecessary
		 control stmt controls.  */
	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
		{
		  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
		    gimple_cond_make_true (cond_stmt);
		  else
		    gimple_cond_make_false (cond_stmt);
		  update_stmt (stmt);
		}
	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
		{
		  /* Force the switch to take its first case label.  */
		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
		  gimple_switch_set_index
		    (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
		  update_stmt (stmt);
		}
	      else
		{
		  unlink_stmt_vdef (stmt);
		  gsi_remove (&bsi, true);
		  release_defs (stmt);
		  continue;
		}
	    }
	  gsi_next (&bsi);
	}
    }

  free (bbs);
}
1124 :
1125 : /* If VAL memory representation contains the same value in all bytes,
1126 : return that value, otherwise return -1.
1127 : E.g. for 0x24242424 return 0x24, for IEEE double
1128 : 747708026454360457216.0 return 0x44, etc. */
1129 :
static int
const_with_all_bytes_same (tree val)
{
  unsigned char buf[64];
  int i, len;

  /* Integer zero and an empty (non-clobber) CONSTRUCTOR are all-zero
     bytes by definition.  */
  if (integer_zerop (val)
      || (TREE_CODE (val) == CONSTRUCTOR
	  && !TREE_CLOBBER_P (val)
	  && CONSTRUCTOR_NELTS (val) == 0))
    return 0;

  if (real_zerop (val))
    {
      /* Only return 0 for +0.0, not for -0.0, which doesn't have
	 an all bytes same memory representation.  Don't transform
	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
      switch (TREE_CODE (val))
	{
	case REAL_CST:
	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
	    return 0;
	  break;
	case COMPLEX_CST:
	  /* Both parts must be all-zero bytes.  */
	  if (!const_with_all_bytes_same (TREE_REALPART (val))
	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
	    return 0;
	  break;
	case VECTOR_CST:
	  {
	    /* Any element with a nonzero byte pattern (e.g. -0.0)
	       falls through to the generic encoding check below.  */
	    unsigned int count = vector_cst_encoded_nelts (val);
	    unsigned int j;
	    for (j = 0; j < count; ++j)
	      if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
		break;
	    if (j == count)
	      return 0;
	    break;
	  }
	default:
	  break;
	}
    }

  /* The byte-repetition trick only makes sense for 8-bit bytes.  */
  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
    return -1;

  /* Encode VAL into its target memory representation and verify every
     byte equals the first one.  */
  len = native_encode_expr (val, buf, sizeof (buf));
  if (len == 0)
    return -1;
  for (i = 1; i < len; i++)
    if (buf[i] != buf[0])
      return -1;
  return buf[0];
}
1185 :
1186 : /* Generate a call to memset for PARTITION in LOOP. */
1187 :
static void
generate_memset_builtin (class loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  tree mem, fn, nb_bytes;
  tree val;
  struct builtin_info *builtin = partition->builtin;
  gimple *fn_call;

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  /* Rewrite size and destination so they cannot trap when evaluated
     unconditionally in the preheader, then gimplify them there.  */
  nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  mem = rewrite_to_non_trapping_overflow (builtin->dst_base);
  mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);

  /* This exactly matches the pattern recognition in classify_partition.  */
  val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
  /* Handle constants like 0x15151515 and similarly
     floating point constants etc. where all bytes are the same.  */
  int bytev = const_with_all_bytes_same (val);
  if (bytev != -1)
    val = build_int_cst (integer_type_node, bytev);
  else if (TREE_CODE (val) == INTEGER_CST)
    val = fold_convert (integer_type_node, val);
  else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
    {
      /* memset takes an int argument; insert a conversion.  */
      tree tem = make_ssa_name (integer_type_node);
      gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
      gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
      val = tem;
    }

  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
  fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
  gimple_set_location (fn_call, partition->loc);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
  fold_stmt (&gsi);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "generated memset");
      if (bytev == 0)
	fprintf (dump_file, " zero\n");
      else
	fprintf (dump_file, "\n");
    }
}
1239 :
1240 : /* Generate a call to memcpy for PARTITION in LOOP. */
1241 :
static void
generate_memcpy_builtin (class loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  gimple *fn_call;
  tree dest, src, fn, nb_bytes;
  enum built_in_function kind;
  struct builtin_info *builtin = partition->builtin;

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  dest = rewrite_to_non_trapping_overflow (builtin->dst_base);
  src = rewrite_to_non_trapping_overflow (builtin->src_base);
  /* memcpy requires non-overlapping regions; use memmove unless the
     partition was classified PKIND_MEMCPY or points-to information
     shows the pointers cannot alias.  */
  if (partition->kind == PKIND_MEMCPY
      || ! ptr_derefs_may_alias_p (dest, src))
    kind = BUILT_IN_MEMCPY;
  else
    kind = BUILT_IN_MEMMOVE;
  /* Try harder if we're copying a constant size.  */
  if (kind == BUILT_IN_MEMMOVE && poly_int_tree_p (nb_bytes))
    {
      /* Form SRC - DEST as an affine combination; if that distance
	 provably exceeds the copy size in both directions the regions
	 cannot overlap and memcpy is safe after all.  */
      aff_tree asrc, adest;
      tree_to_aff_combination (src, ptr_type_node, &asrc);
      tree_to_aff_combination (dest, ptr_type_node, &adest);
      aff_combination_scale (&adest, -1);
      aff_combination_add (&asrc, &adest);
      if (aff_comb_cannot_overlap_p (&asrc, wi::to_poly_widest (nb_bytes),
				     wi::to_poly_widest (nb_bytes)))
	kind = BUILT_IN_MEMCPY;
    }

  dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
				   false, GSI_CONTINUE_LINKING);
  src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);
  fn = build_fold_addr_expr (builtin_decl_implicit (kind));
  fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
  gimple_set_location (fn_call, partition->loc);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
  fold_stmt (&gsi);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (kind == BUILT_IN_MEMCPY)
	fprintf (dump_file, "generated memcpy\n");
      else
	fprintf (dump_file, "generated memmove\n");
    }
}
1295 :
1296 : /* Remove and destroy the loop LOOP. */
1297 :
1298 : static void
1299 10157 : destroy_loop (class loop *loop)
1300 : {
1301 10157 : unsigned nbbs = loop->num_nodes;
1302 10157 : edge exit = single_exit (loop);
1303 10157 : basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1304 10157 : basic_block *bbs;
1305 10157 : unsigned i;
1306 :
1307 10157 : bbs = get_loop_body_in_dom_order (loop);
1308 :
1309 10157 : gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest);
1310 10157 : bool safe_p = single_pred_p (exit->dest);
1311 31982 : for (unsigned i = 0; i < nbbs; ++i)
1312 : {
1313 : /* We have made sure to not leave any dangling uses of SSA
1314 : names defined in the loop. With the exception of virtuals.
1315 : Make sure we replace all uses of virtual defs that will remain
1316 : outside of the loop with the bare symbol as delete_basic_block
1317 : will release them. */
1318 51657 : for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1319 29832 : gsi_next (&gsi))
1320 : {
1321 29832 : gphi *phi = gsi.phi ();
1322 70771 : if (virtual_operand_p (gimple_phi_result (phi)))
1323 11107 : mark_virtual_phi_result_for_renaming (phi);
1324 : }
1325 132906 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
1326 : {
1327 89256 : gimple *stmt = gsi_stmt (gsi);
1328 89256 : tree vdef = gimple_vdef (stmt);
1329 49248 : if (vdef && TREE_CODE (vdef) == SSA_NAME)
1330 11865 : mark_virtual_operand_for_renaming (vdef);
1331 : /* Also move and eventually reset debug stmts. We can leave
1332 : constant values in place in case the stmt dominates the exit.
1333 : ??? Non-constant values from the last iteration can be
1334 : replaced with final values if we can compute them. */
1335 89256 : if (gimple_debug_bind_p (stmt))
1336 : {
1337 20238 : tree val = gimple_debug_bind_get_value (stmt);
1338 20238 : gsi_move_before (&gsi, &dst_gsi);
1339 20238 : if (val
1340 20238 : && (!safe_p
1341 10603 : || !is_gimple_min_invariant (val)
1342 1118 : || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i])))
1343 : {
1344 11301 : gimple_debug_bind_reset_value (stmt);
1345 11301 : update_stmt (stmt);
1346 : }
1347 : }
1348 : else
1349 69018 : gsi_next (&gsi);
1350 : }
1351 : }
1352 :
1353 10157 : redirect_edge_pred (exit, src);
1354 10157 : exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1355 10157 : exit->flags |= EDGE_FALLTHRU;
1356 10157 : exit->probability = profile_probability::always ();
1357 10157 : cancel_loop_tree (loop);
1358 10157 : rescan_loop_exit (exit, false, true);
1359 :
1360 10157 : i = nbbs;
1361 21825 : do
1362 : {
1363 21825 : --i;
1364 21825 : delete_basic_block (bbs[i]);
1365 : }
1366 21825 : while (i != 0);
1367 :
1368 10157 : free (bbs);
1369 :
1370 10157 : set_immediate_dominator (CDI_DOMINATORS, dest,
1371 : recompute_dominator (CDI_DOMINATORS, dest));
1372 10157 : }
1373 :
1374 : /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */
1375 :
static bool
generate_code_for_partition (class loop *loop,
			     partition *partition, bool copy_p,
			     bool keep_lc_phis_p)
{
  /* Dispatch on the partition classification.  */
  switch (partition->kind)
    {
    case PKIND_NORMAL:
    case PKIND_PARTIAL_MEMSET:
      /* Reductions all have to be in the last partition.  */
      gcc_assert (!partition_reduction_p (partition)
		  || !copy_p);
      generate_loops_for_partition (loop, partition, copy_p,
				    keep_lc_phis_p);
      /* A loop was generated, so LOOP itself is never destroyed here.  */
      return false;

    case PKIND_MEMSET:
      generate_memset_builtin (loop, partition);
      break;

    case PKIND_MEMCPY:
    case PKIND_MEMMOVE:
      generate_memcpy_builtin (loop, partition);
      break;

    default:
      gcc_unreachable ();
    }

  /* Common tail for partitions we turn into a call.  If this was the last
     partition for which we generate code, we have to destroy the loop.  */
  if (!copy_p)
    return true;
  return false;
}
1411 :
/* Return the data dependence relation between data references A and B,
   computing and caching it in DDRS_TABLE on the first query.  A's
   statement must not come after B's in RDG vertex order.  */

data_dependence_relation *
loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a,
					data_reference_p b)
{
  struct data_dependence_relation ent, **slot;
  struct data_dependence_relation *ddr;

  /* Only dependences involving at least one write are of interest.  */
  gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
  gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
  ent.a = a;
  ent.b = b;
  slot = ddrs_table->find_slot (&ent, INSERT);
  if (*slot == NULL)
    {
      /* Cache miss: compute the relation once and remember it.  */
      ddr = initialize_data_dependence_relation (a, b, loop_nest);
      compute_affine_dependence (ddr, loop_nest[0]);
      *slot = ddr;
    }

  return *slot;
}
1434 :
/* Return true if the dependence between data references DR1 and DR2
   creates a dependence cycle, i.e. prevents executing their partition
   in parallel.  */

bool
loop_distribution::data_dep_in_cycle_p (struct graph *rdg,
					data_reference_p dr1,
					data_reference_p dr2)
{
  struct data_dependence_relation *ddr;

  /* Re-shuffle data-refs to be in topological order.  */
  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
    std::swap (dr1, dr2);

  ddr = get_data_dependence (rdg, dr1, dr2);

  /* In case of no data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;
  /* For unknown data dependence or known data dependence which can't be
     expressed in classic distance vector, we check if it can be resolved
     by runtime alias check.  If yes, we still consider data dependence
     as won't introduce data dependence cycle.  */
  else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
	   || DDR_NUM_DIST_VECTS (ddr) == 0)
    return !runtime_alias_check_p (ddr, NULL, true);
  /* Multiple distance vectors: conservatively assume a cycle.  */
  else if (DDR_NUM_DIST_VECTS (ddr) > 1)
    return true;
  /* A reversed or all-zero distance vector does not form a cycle.  */
  else if (DDR_REVERSED_P (ddr)
	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), DDR_NB_LOOPS (ddr)))
    return false;

  return true;
}
1467 :
/* Check whether merging the data references of PARTITION2 with those of
   PARTITION1 introduces a data dependence cycle; if so, mark PARTITION1
   as PTYPE_SEQUENTIAL.  Called with PARTITION1 == PARTITION2 to check a
   single partition against itself.  */

void
loop_distribution::update_type_for_merge (struct graph *rdg,
					  partition *partition1,
					  partition *partition2)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2;

  EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
    {
      /* When checking a partition against itself, visit each unordered
	 pair of data references only once.  */
      unsigned start = (partition1 == partition2) ? i + 1 : 0;

      dr1 = datarefs_vec[i];
      EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
	{
	  dr2 = datarefs_vec[j];
	  /* Two reads can never conflict.  */
	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
	    continue;

	  /* Partition can only be executed sequentially if there is any
	     data dependence cycle.  */
	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
	    {
	      partition1->type = PTYPE_SEQUENTIAL;
	      return;
	    }
	}
    }
}
1498 :
/* Build and return a partition containing RDG vertex V together with
   every vertex reachable from it (its dependence closure), classifying
   whether the partition can be executed in parallel.  */

partition *
loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v)
{
  partition *partition = partition_alloc ();
  auto_vec<int, 3> nodes;
  unsigned i, j;
  int x;
  data_reference_p dr;

  /* Collect V and everything reachable from it in the RDG.  */
  graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

  FOR_EACH_VEC_ELT (nodes, i, x)
    {
      bitmap_set_bit (partition->stmts, x);

      for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
	{
	  unsigned idx = (unsigned) DR_INDEX (dr);
	  gcc_assert (idx < datarefs_vec.length ());

	  /* Partition can only be executed sequentially if there is any
	     unknown data reference.  */
	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
	      || !DR_INIT (dr) || !DR_STEP (dr))
	    partition->type = PTYPE_SEQUENTIAL;

	  bitmap_set_bit (partition->datarefs, idx);
	}
    }

  /* Already sequential: no need to analyze dependences.  */
  if (partition->type == PTYPE_SEQUENTIAL)
    return partition;

  /* Further check if any data dependence prevents us from executing the
     partition parallelly.  */
  update_type_for_merge (rdg, partition, partition);

  return partition;
}
1538 :
1539 : /* Given PARTITION of LOOP and RDG, record single load/store data references
1540 : for builtin partition in SRC_DR/DST_DR, return false if there is no such
1541 : data references. */
1542 :
static bool
find_single_drs (class loop *loop, struct graph *rdg, const bitmap &partition_stmts,
		 data_reference_p *dst_dr, data_reference_p *src_dr)
{
  unsigned i;
  data_reference_p single_ld = NULL, single_st = NULL;
  bitmap_iterator bi;

  /* Scan all statements of the partition for their memory accesses.  */
  EXECUTE_IF_SET_IN_BITMAP (partition_stmts, 0, i, bi)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      data_reference_p dr;

      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      /* Any scalar stmts are ok.  */
      if (!gimple_vuse (stmt))
	continue;

      /* Otherwise just regular loads/stores.  */
      if (!gimple_assign_single_p (stmt))
	return false;

      /* But exactly one store and/or load.  */
      for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
	{
	  tree type = TREE_TYPE (DR_REF (dr));

	  /* The memset, memcpy and memmove library calls are only
	     able to deal with generic address space.  */
	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
	    return false;

	  if (DR_IS_READ (dr))
	    {
	      if (single_ld != NULL)
		return false;
	      single_ld = dr;
	    }
	  else
	    {
	      if (single_st != NULL)
		return false;
	      single_st = dr;
	    }
	}
    }

  /* Nothing to form a builtin from.  */
  if (!single_ld && !single_st)
    return false;

  basic_block bb_ld = NULL;
  basic_block bb_st = NULL;
  edge exit = single_exit (loop);

  if (single_ld)
    {
      /* Bail out if this is a bitfield memory reference.  */
      if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
	return false;

      /* Data reference must be executed exactly once per iteration of each
	 loop in the loop nest.  We only need to check dominance information
	 against the outermost one in a perfect loop nest because a bb can't
	 dominate outermost loop's latch without dominating inner loop's.  */
      bb_ld = gimple_bb (DR_STMT (single_ld));
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
	return false;

      /* The data reference must also be executed before possibly exiting
	 the loop as otherwise we'd for example unconditionally execute
	 memset (ptr, 0, n) which even with n == 0 implies ptr is non-NULL.  */
      if (bb_ld != loop->header
	  && (!exit
	      || !dominated_by_p (CDI_DOMINATORS, exit->src, bb_ld)))
	return false;
    }

  if (single_st)
    {
      /* Bail out if this is a bitfield memory reference.  */
      if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
	return false;

      /* Data reference must be executed exactly once per iteration.
	 Same as single_ld, we only need to check against the outermost
	 loop.  */
      bb_st = gimple_bb (DR_STMT (single_st));
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
	return false;

      /* And before exiting the loop.  */
      if (bb_st != loop->header
	  && (!exit
	      || !dominated_by_p (CDI_DOMINATORS, exit->src, bb_st)))
	return false;
    }

  if (single_ld && single_st)
    {
      /* Load and store must be in the same loop nest.  */
      if (bb_st->loop_father != bb_ld->loop_father)
	return false;

      /* Both must relate to that loop's exit in the same way, otherwise
	 their execution counts could differ.  */
      edge e = single_exit (bb_st->loop_father);
      bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
      bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
      if (dom_ld != dom_st)
	return false;
    }

  *src_dr = single_ld;
  *dst_dr = single_st;
  return true;
}
1661 :
1662 : /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1663 : loops from inner to outer to see if loop's step equals to access size at
1664 : each level of loop. Return 2 if we can prove this at all level loops;
1665 : record access base and size in BASE and SIZE; save loop's step at each
1666 : level of loop in STEPS if it is not null. For example:
1667 :
1668 : int arr[100][100][100];
1669 : for (i = 0; i < 100; i++) ;steps[2] = 40000
1670 : for (j = 100; j > 0; j--) ;steps[1] = -400
1671 : for (k = 0; k < 100; k++) ;steps[0] = 4
1672 : arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
1673 :
1674 : Return 1 if we can prove the equality at the innermost loop, but not all
1675 : level loops. In this case, no information is recorded.
1676 :
1677 : Return 0 if no equality can be proven at any level loops. */
1678 :
static int
compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
		      tree *size, vec<tree> *steps = NULL)
{
  location_t loc = gimple_location (DR_STMT (dr));
  basic_block bb = gimple_bb (DR_STMT (dr));
  class loop *loop = bb->loop_father;
  tree ref = DR_REF (dr);
  tree access_base = build_fold_addr_expr (ref);
  tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
  int res = 0;

  /* Walk outwards from the innermost loop containing DR up to
     LOOP_NEST, analyzing the address evolution at each level.  */
  do {
      tree scev_fn = analyze_scalar_evolution (loop, access_base);
      if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
	return res;

      /* The initial value at this level becomes the base for the next
	 outer level; it must itself be loop-invariant.  */
      access_base = CHREC_LEFT (scev_fn);
      if (tree_contains_chrecs (access_base, NULL))
	return res;

      tree scev_step = CHREC_RIGHT (scev_fn);
      /* Only support constant steps.  */
      if (TREE_CODE (scev_step) != INTEGER_CST)
	return res;

      enum ev_direction access_dir = scev_direction (scev_fn);
      if (access_dir == EV_DIR_UNKNOWN)
	return res;

      if (steps != NULL)
	steps->safe_push (scev_step);

      scev_step = fold_convert_loc (loc, sizetype, scev_step);
      /* Compute absolute value of scev step.  */
      if (access_dir == EV_DIR_DECREASES)
	scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);

      /* At each level of loop, scev step must equal to access size.  In other
	 words, DR must access consecutive memory between loop iterations.  */
      if (!operand_equal_p (scev_step, access_size, 0))
	return res;

      /* Access stride can be computed for data reference at least for the
	 innermost loop.  */
      res = 1;

      /* Compute DR's execution times in loop.  */
      tree niters = number_of_latch_executions (loop);
      niters = fold_convert_loc (loc, sizetype, niters);
      if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
	niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);

      /* Compute DR's overall access size in loop.  */
      access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
				     niters, scev_step);
      /* Adjust base address in case of negative step.  */
      if (access_dir == EV_DIR_DECREASES)
	{
	  tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
				      scev_step, access_size);
	  access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
	}
  } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);

  *base = access_base;
  *size = access_size;
  /* Access stride can be computed for data reference at each level loop.  */
  return 2;
}
1749 :
1750 : /* Allocate and return builtin struct. Record information like DST_DR,
1751 : SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1752 :
1753 : static struct builtin_info *
1754 12833 : alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1755 : tree dst_base, tree src_base, tree size)
1756 : {
1757 0 : struct builtin_info *builtin = XNEW (struct builtin_info);
1758 12833 : builtin->dst_dr = dst_dr;
1759 12833 : builtin->src_dr = src_dr;
1760 12833 : builtin->dst_base = dst_base;
1761 12833 : builtin->src_base = src_base;
1762 12833 : builtin->size = size;
1763 12833 : return builtin;
1764 : }
1765 :
/* Given data reference DR in loop nest LOOP, classify if it forms builtin
   memset call.  On success record the builtin in PARTITION and set its
   kind to PKIND_MEMSET, or to PKIND_PARTIAL_MEMSET when the access range
   is only computable for the innermost loop.  */

static void
classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
{
  gimple *stmt = DR_STMT (dr);
  tree base, size, rhs = gimple_assign_rhs1 (stmt);

  /* The stored value must either be a constant whose bytes are all the
     same, or have a byte-sized integral mode; otherwise it cannot be
     expanded to a memset byte value.  */
  if (const_with_all_bytes_same (rhs) == -1
      && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
	  || (TYPE_MODE (TREE_TYPE (rhs))
	      != TYPE_MODE (unsigned_char_type_node))))
    return;

  /* An SSA value defined inside the loop varies across iterations, so
     the stores cannot all write the same value.  */
  if (TREE_CODE (rhs) == SSA_NAME
      && !SSA_NAME_IS_DEFAULT_DEF (rhs)
      && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
    return;

  /* 0: range unknown; 1: known for the innermost loop only; 2: known at
     every loop level.  */
  int res = compute_access_range (loop, dr, &base, &size);
  if (res == 0)
    return;
  if (res == 1)
    {
      partition->kind = PKIND_PARTIAL_MEMSET;
      return;
    }

  /* Split the base into a constant offset part, used later when fusing
     adjacent memset builtins.  */
  tree base_offset;
  tree base_base;
  split_constant_offset (base, &base_base, &base_offset);
  if (!cst_and_fits_in_hwi (base_offset))
    return;
  unsigned HOST_WIDE_INT const_base_offset = int_cst_value (base_offset);

  struct builtin_info *builtin;
  builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
  builtin->dst_base_base = base_base;
  builtin->dst_base_offset = const_base_offset;
  partition->builtin = builtin;
  partition->kind = PKIND_MEMSET;
}
1809 :
/* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
   if it forms builtin memcpy or memmove call.  On success record the builtin
   in PARTITION and set its kind to PKIND_MEMCPY or PKIND_MEMMOVE.  */

void
loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg,
					  partition *partition,
					  data_reference_p dst_dr,
					  data_reference_p src_dr)
{
  tree base, size, src_base, src_size;
  auto_vec<tree> dst_steps, src_steps;

  /* Compute access range of both load and store.  Both must be fully
     analyzable at every loop level (result 2).  */
  int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
  if (res != 2)
    return;
  res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
  if (res != 2)
    return;

  /* They must have the same access size.  */
  if (!operand_equal_p (size, src_size, 0))
    return;

  /* They must have the same storage order.  */
  if (reverse_storage_order_for_component_p (DR_REF (dst_dr))
      != reverse_storage_order_for_component_p (DR_REF (src_dr)))
    return;

  /* Load and store in loop nest must access memory in the same way, i.e,
     they must have the same steps in each loop of the nest.  */
  if (dst_steps.length () != src_steps.length ())
    return;
  for (unsigned i = 0; i < dst_steps.length (); ++i)
    if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
      return;

  /* Now check whether there is a dependence.  */
  ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);

  /* Classify as memcpy if no dependence between load and store.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    {
      partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
      partition->kind = PKIND_MEMCPY;
      return;
    }

  /* Can't do memmove in case of unknown dependence or dependence without
     classical distance vector.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
      || DDR_NUM_DIST_VECTS (ddr) == 0)
    return;

  unsigned i;
  lambda_vector dist_v;
  int num_lev = (DDR_LOOP_NEST (ddr)).length ();
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      unsigned dep_lev = dependence_level (dist_v, num_lev);
      /* Can't do memmove if load depends on store.  */
      if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
	return;
    }

  partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
  partition->kind = PKIND_MEMMOVE;
  return;
}
1879 :
/* Classify PARTITION of LOOP with reduced dependence graph RDG as a
   builtin (memset/memcpy/memmove) kind if possible.  STMT_IN_ALL_PARTITIONS
   contains statements belonging to all partitions.  Returns true if the
   partition contains a reduction statement that is shared by all
   partitions (and hence was not used to mark the partition itself as a
   reduction).  */

bool
loop_distribution::classify_partition (loop_p loop,
				       struct graph *rdg, partition *partition,
				       bitmap stmt_in_all_partitions)
{
  bitmap_iterator bi;
  unsigned i;
  data_reference_p single_ld = NULL, single_st = NULL;
  bool volatiles_p = false, has_reduction = false;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple *stmt = RDG_STMT (rdg, i);

      if (gimple_has_volatile_ops (stmt))
	volatiles_p = true;

      /* If the stmt is not included by all partitions and there is uses
	 outside of the loop, then mark the partition as reduction.  */
      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
	{
	  /* Due to limitation in the transform phase we have to fuse all
	     reduction partitions.  As a result, this could cancel valid
	     loop distribution especially for loop that induction variable
	     is used outside of loop.  To workaround this issue, we skip
	     marking partition as reduction if the reduction stmt belongs
	     to all partitions.  In such case, reduction will be computed
	     correctly no matter how partitions are fused/distributed.  */
	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
	    partition->reduction_p = true;
	  else
	    has_reduction = true;
	}
    }

  /* Simple workaround to prevent classifying the partition as builtin
     if it contains any use outside of loop.  For the case where all
     partitions have the reduction this simple workaround is delayed
     to only affect the last partition.  */
  if (partition->reduction_p)
    return has_reduction;

  /* Perform general partition disqualification for builtins.  */
  if (volatiles_p
      || !flag_tree_loop_distribute_patterns)
    return has_reduction;

  /* Find single load/store data references for builtin partition.  */
  if (!find_single_drs (loop, rdg, partition->stmts, &single_st, &single_ld)
      || !single_st)
    return has_reduction;

  if (single_ld && single_st)
    {
      gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
      /* Direct aggregate copy or via an SSA name temporary.  */
      if (load != store
	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
	return has_reduction;
    }

  partition->loc = gimple_location (DR_STMT (single_st));

  /* Classify the builtin kind.  A store alone can be a memset; a
     load/store pair can be a memcpy or memmove.  */
  if (single_ld == NULL)
    classify_builtin_st (loop, partition, single_st);
  else
    classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
  return has_reduction;
}
1950 :
1951 : bool
1952 866785 : loop_distribution::share_memory_accesses (struct graph *rdg,
1953 : partition *partition1, partition *partition2)
1954 : {
1955 866785 : unsigned i, j;
1956 866785 : bitmap_iterator bi, bj;
1957 866785 : data_reference_p dr1, dr2;
1958 :
1959 : /* First check whether in the intersection of the two partitions are
1960 : any loads or stores. Common loads are the situation that happens
1961 : most often. */
1962 6375536 : EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1963 5513397 : if (RDG_MEM_WRITE_STMT (rdg, i)
1964 5513397 : || RDG_MEM_READS_STMT (rdg, i))
1965 : return true;
1966 :
1967 : /* Then check whether the two partitions access the same memory object. */
1968 1920610 : EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1969 : {
1970 1060988 : dr1 = datarefs_vec[i];
1971 :
1972 1060988 : if (!DR_BASE_ADDRESS (dr1)
1973 1059422 : || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1974 1566 : continue;
1975 :
1976 2269613 : EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1977 : {
1978 1212708 : dr2 = datarefs_vec[j];
1979 :
1980 1212708 : if (!DR_BASE_ADDRESS (dr2)
1981 1212191 : || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1982 517 : continue;
1983 :
1984 1212191 : if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1985 1063249 : && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1986 1061856 : && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1987 1214739 : && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1988 : return true;
1989 : }
1990 : }
1991 :
1992 : return false;
1993 : }
1994 :
1995 : /* For each seed statement in STARTING_STMTS, this function builds
1996 : partition for it by adding depended statements according to RDG.
1997 : All partitions are recorded in PARTITIONS. */
1998 :
1999 : void
2000 137031 : loop_distribution::rdg_build_partitions (struct graph *rdg,
2001 : vec<gimple *> starting_stmts,
2002 : vec<partition *> *partitions)
2003 : {
2004 137031 : auto_bitmap processed;
2005 137031 : int i;
2006 137031 : gimple *stmt;
2007 :
2008 526883 : FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
2009 : {
2010 252821 : int v = rdg_vertex_for_stmt (rdg, stmt);
2011 :
2012 252821 : if (dump_file && (dump_flags & TDF_DETAILS))
2013 141 : fprintf (dump_file,
2014 : "ldist asked to generate code for vertex %d\n", v);
2015 :
2016 : /* If the vertex is already contained in another partition so
2017 : is the partition rooted at it. */
2018 252821 : if (bitmap_bit_p (processed, v))
2019 6565 : continue;
2020 :
2021 246256 : partition *partition = build_rdg_partition_for_vertex (rdg, v);
2022 246256 : bitmap_ior_into (processed, partition->stmts);
2023 :
2024 246256 : if (dump_file && (dump_flags & TDF_DETAILS))
2025 : {
2026 141 : fprintf (dump_file, "ldist creates useful %s partition:\n",
2027 141 : partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
2028 141 : bitmap_print (dump_file, partition->stmts, " ", "\n");
2029 : }
2030 :
2031 246256 : partitions->safe_push (partition);
2032 : }
2033 :
2034 : /* All vertices should have been assigned to at least one partition now,
2035 : other than vertices belonging to dead code. */
2036 137031 : }
2037 :
2038 : /* Dump to FILE the PARTITIONS. */
2039 :
2040 : static void
2041 42 : dump_rdg_partitions (FILE *file, const vec<partition *> &partitions)
2042 : {
2043 42 : int i;
2044 42 : partition *partition;
2045 :
2046 108 : FOR_EACH_VEC_ELT (partitions, i, partition)
2047 66 : debug_bitmap_file (file, partition->stmts);
2048 42 : }
2049 :
/* Debug PARTITIONS.  */
extern void debug_rdg_partitions (const vec<partition *> &);

/* Print the statement bitmaps of PARTITIONS to stderr; intended to be
   called from the debugger.  */
DEBUG_FUNCTION void
debug_rdg_partitions (const vec<partition *> &partitions)
{
  dump_rdg_partitions (stderr, partitions);
}
2058 :
2059 : /* Returns the number of read and write operations in the RDG. */
2060 :
2061 : static int
2062 2646 : number_of_rw_in_rdg (struct graph *rdg)
2063 : {
2064 2646 : int i, res = 0;
2065 :
2066 40163 : for (i = 0; i < rdg->n_vertices; i++)
2067 : {
2068 37517 : if (RDG_MEM_WRITE_STMT (rdg, i))
2069 7636 : ++res;
2070 :
2071 37517 : if (RDG_MEM_READS_STMT (rdg, i))
2072 5476 : ++res;
2073 : }
2074 :
2075 2646 : return res;
2076 : }
2077 :
2078 : /* Returns the number of read and write operations in a PARTITION of
2079 : the RDG. */
2080 :
2081 : static int
2082 5834 : number_of_rw_in_partition (struct graph *rdg, partition *partition)
2083 : {
2084 5834 : int res = 0;
2085 5834 : unsigned i;
2086 5834 : bitmap_iterator ii;
2087 :
2088 56099 : EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
2089 : {
2090 50265 : if (RDG_MEM_WRITE_STMT (rdg, i))
2091 7628 : ++res;
2092 :
2093 50265 : if (RDG_MEM_READS_STMT (rdg, i))
2094 5476 : ++res;
2095 : }
2096 :
2097 5834 : return res;
2098 : }
2099 :
2100 : /* Returns true when one of the PARTITIONS contains all the read or
2101 : write operations of RDG. */
2102 :
2103 : static bool
2104 2646 : partition_contains_all_rw (struct graph *rdg,
2105 : const vec<partition *> &partitions)
2106 : {
2107 2646 : int i;
2108 2646 : partition *partition;
2109 2646 : int nrw = number_of_rw_in_rdg (rdg);
2110 :
2111 8412 : FOR_EACH_VEC_ELT (partitions, i, partition)
2112 5834 : if (nrw == number_of_rw_in_partition (rdg, partition))
2113 : return true;
2114 :
2115 : return false;
2116 : }
2117 :
/* Compute the data dependence direction between the sets of data
   references DRS1 and DRS2 of RDG, starting from DIR.  Returns 0 for no
   dependence, -1 for backward, 1 for forward and 2 for both directions
   (at which point the partitions have to be merged).  Dependences that
   could be resolved by a runtime alias check are pushed onto ALIAS_DDRS
   if it is non-NULL, instead of forcing direction 2.  */

int
loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir,
					 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2, saved_dr1;

  /* dependence direction - 0 is no dependence, -1 is back,
     1 is forth, 2 is both (we can stop then, merging will occur).  */
  EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
    {
      dr1 = datarefs_vec[i];

      EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
	{
	  int res, this_dir = 1;
	  ddr_p ddr;

	  dr2 = datarefs_vec[j];

	  /* Skip all <read, read> data dependence.  */
	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
	    continue;

	  saved_dr1 = dr1;
	  /* Re-shuffle data-refs to be in topological order.  */
	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
	    {
	      std::swap (dr1, dr2);
	      this_dir = -this_dir;
	    }
	  ddr = get_data_dependence (rdg, dr1, dr2);
	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	    {
	      this_dir = 0;
	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
					   DR_BASE_ADDRESS (dr2));
	      /* Be conservative.  If data references are not well analyzed,
		 or the two data references have the same base address and
		 offset, add dependence and consider it alias to each other.
		 In other words, the dependence cannot be resolved by
		 runtime alias check.  */
	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
		  || !DR_INIT (dr1) || !DR_INIT (dr2)
		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
		  || res == 0)
		this_dir = 2;
	      /* Data dependence could be resolved by runtime alias check,
		 record it in ALIAS_DDRS.  */
	      else if (alias_ddrs != NULL)
		alias_ddrs->safe_push (ddr);
	      /* Or simply ignore it.  */
	    }
	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
	    {
	      /* Known dependences can still be unordered throughout the
		 iteration space, see gcc.dg/tree-ssa/ldist-16.c and
		 gcc.dg/tree-ssa/pr94969.c.  */
	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
		this_dir = 2;
	      else
		{
		  /* If the overlap is exact preserve stmt order.  */
		  if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
					   DDR_NB_LOOPS (ddr)))
		    ;
		  /* Else as the distance vector is lexicographic positive swap
		     the dependence direction.  */
		  else
		    {
		      if (DDR_REVERSED_P (ddr))
			this_dir = -this_dir;
		      this_dir = -this_dir;
		    }
		  /* When the dependence distance of the innermost common
		     loop of the DRs is zero we have a conflict.  This is
		     due to wonky dependence analysis which sometimes
		     ends up using a zero distance in place of unknown.  */
		  auto l1 = gimple_bb (DR_STMT (dr1))->loop_father;
		  auto l2 = gimple_bb (DR_STMT (dr2))->loop_father;
		  int idx = index_in_loop_nest (find_common_loop (l1, l2)->num,
						DDR_LOOP_NEST (ddr));
		  if (DDR_DIST_VECT (ddr, 0)[idx] == 0
		      /* Unless it is the outermost loop which is the one
			 we eventually distribute.  */
		      && idx != 0)
		    this_dir = 2;
		}
	    }
	  else
	    this_dir = 0;
	  /* Both directions at once: no need to look any further.  */
	  if (this_dir == 2)
	    return 2;
	  else if (dir == 0)
	    dir = this_dir;
	  else if (this_dir != 0 && dir != this_dir)
	    return 2;
	  /* Shuffle "back" dr1.  */
	  dr1 = saved_dr1;
	}
    }
  return dir;
}
2225 :
2226 : /* Compare postorder number of the partition graph vertices V1 and V2. */
2227 :
2228 : static int
2229 788950 : pgcmp (const void *v1_, const void *v2_)
2230 : {
2231 788950 : const vertex *v1 = (const vertex *)v1_;
2232 788950 : const vertex *v2 = (const vertex *)v2_;
2233 788950 : return v2->post - v1->post;
2234 : }
2235 :
/* Data attached to vertices of partition dependence graph.  */
struct pg_vdata
{
  /* ID of the corresponding partition, i.e., its index in the vector
     the graph was built from.  */
  int id;
  /* The partition.  Set to NULL once the partition has been merged
     into another one.  */
  struct partition *partition;
};
2244 :
/* Data attached to edges of partition dependence graph.  */
struct pg_edata
{
  /* If the dependence edge can be resolved by runtime alias check,
     this vector contains data dependence relations for runtime alias
     check.  On the other hand, if the dependence edge is introduced
     because of compilation time known data dependence, this vector
     contains nothing.  An edge with no pg_edata attached at all
     likewise represents a compile-time known dependence.  */
  vec<ddr_p> alias_ddrs;
};
2255 :
/* Callback data for traversing edges in graph.  */
struct pg_edge_callback_data
{
  /* Bitmap of strongly connected components that should be merged.  */
  bitmap sccs_to_merge;
  /* Array containing component information for all vertices.  */
  int *vertices_component;
  /* Vector to record all data dependence relations which are needed
     to break strongly connected components by runtime alias checks.  */
  vec<ddr_p> *alias_ddrs;
};
2267 :
2268 : /* Initialize vertice's data for partition dependence graph PG with
2269 : PARTITIONS. */
2270 :
2271 : static void
2272 12262 : init_partition_graph_vertices (struct graph *pg,
2273 : vec<struct partition *> *partitions)
2274 : {
2275 12262 : int i;
2276 12262 : partition *partition;
2277 12262 : struct pg_vdata *data;
2278 :
2279 69018 : for (i = 0; partitions->iterate (i, &partition); ++i)
2280 : {
2281 56756 : data = new pg_vdata;
2282 56756 : pg->vertices[i].data = data;
2283 56756 : data->id = i;
2284 56756 : data->partition = partition;
2285 : }
2286 12262 : }
2287 :
2288 : /* Add edge <I, J> to partition dependence graph PG. Attach vector of data
2289 : dependence relations to the EDGE if DDRS isn't NULL. */
2290 :
2291 : static void
2292 412111 : add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2293 : {
2294 412111 : struct graph_edge *e = add_edge (pg, i, j);
2295 :
2296 : /* If the edge is attached with data dependence relations, it means this
2297 : dependence edge can be resolved by runtime alias checks. */
2298 412111 : if (ddrs != NULL)
2299 : {
2300 10070 : struct pg_edata *data = new pg_edata;
2301 :
2302 10070 : gcc_assert (ddrs->length () > 0);
2303 10070 : e->data = data;
2304 10070 : data->alias_ddrs = vNULL;
2305 10070 : data->alias_ddrs.safe_splice (*ddrs);
2306 : }
2307 412111 : }
2308 :
2309 : /* Callback function for graph travesal algorithm. It returns true
2310 : if edge E should skipped when traversing the graph. */
2311 :
2312 : static bool
2313 1951 : pg_skip_alias_edge (struct graph_edge *e)
2314 : {
2315 1951 : struct pg_edata *data = (struct pg_edata *)e->data;
2316 1951 : return (data != NULL && data->alias_ddrs.length () > 0);
2317 : }
2318 :
2319 : /* Callback function freeing data attached to edge E of graph. */
2320 :
2321 : static void
2322 412111 : free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2323 : {
2324 412111 : if (e->data != NULL)
2325 : {
2326 10069 : struct pg_edata *data = (struct pg_edata *)e->data;
2327 10069 : data->alias_ddrs.release ();
2328 10069 : delete data;
2329 : }
2330 412111 : }
2331 :
2332 : /* Free data attached to vertice of partition dependence graph PG. */
2333 :
2334 : static void
2335 12262 : free_partition_graph_vdata (struct graph *pg)
2336 : {
2337 12262 : int i;
2338 12262 : struct pg_vdata *data;
2339 :
2340 69018 : for (i = 0; i < pg->n_vertices; ++i)
2341 : {
2342 56756 : data = (struct pg_vdata *)pg->vertices[i].data;
2343 56756 : delete data;
2344 : }
2345 12262 : }
2346 :
/* Build and return partition dependence graph for PARTITIONS.  RDG is
   reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
   is true, data dependence caused by possible alias between references
   is ignored, as if it doesn't exist at all; otherwise all dependences
   are considered.  */

struct graph *
loop_distribution::build_partition_graph (struct graph *rdg,
					  vec<struct partition *> *partitions,
					  bool ignore_alias_p)
{
  int i, j;
  struct partition *partition1, *partition2;
  graph *pg = new_graph (partitions->length ());
  auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;

  /* When ignoring aliases, pass no vector so pg_add_dependence_edges
     simply drops dependences resolvable by runtime alias checks.  */
  alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;

  init_partition_graph_vertices (pg, partitions);

  /* Examine every unordered pair of partitions once.  */
  for (i = 0; partitions->iterate (i, &partition1); ++i)
    {
      for (j = i + 1; partitions->iterate (j, &partition2); ++j)
	{
	  /* dependence direction - 0 is no dependence, -1 is back,
	     1 is forth, 2 is both (we can stop then, merging will occur).  */
	  int dir = 0;

	  /* If the first partition has reduction, add back edge; if the
	     second partition has reduction, add forth edge.  This makes
	     sure that reduction partition will be sorted as the last one.  */
	  if (partition_reduction_p (partition1))
	    dir = -1;
	  else if (partition_reduction_p (partition2))
	    dir = 1;

	  /* Cleanup the temporary vector.  */
	  alias_ddrs.truncate (0);

	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
					 partition2->datarefs, alias_ddrs_p);

	  /* Add edge to partition graph if there exists dependence.  There
	     are two types of edges.  One type edge is caused by compilation
	     time known dependence, this type cannot be resolved by runtime
	     alias check.  The other type can be resolved by runtime alias
	     check.  */
	  if (dir == 1 || dir == 2
	      || alias_ddrs.length () > 0)
	    {
	      /* Attach data dependence relations to edge that can be resolved
		 by runtime alias check.  */
	      bool alias_edge_p = (dir != 1 && dir != 2);
	      add_partition_graph_edge (pg, i, j,
					(alias_edge_p) ? &alias_ddrs : NULL);
	    }
	  if (dir == -1 || dir == 2
	      || alias_ddrs.length () > 0)
	    {
	      /* Attach data dependence relations to edge that can be resolved
		 by runtime alias check.  */
	      bool alias_edge_p = (dir != -1 && dir != 2);
	      add_partition_graph_edge (pg, j, i,
					(alias_edge_p) ? &alias_ddrs : NULL);
	    }
	}
    }
  return pg;
}
2416 :
2417 : /* Sort partitions in PG in descending post order and store them in
2418 : PARTITIONS. */
2419 :
2420 : static void
2421 12262 : sort_partitions_by_post_order (struct graph *pg,
2422 : vec<struct partition *> *partitions)
2423 : {
2424 12262 : int i;
2425 12262 : struct pg_vdata *data;
2426 :
2427 : /* Now order the remaining nodes in descending postorder. */
2428 12262 : qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2429 12262 : partitions->truncate (0);
2430 81280 : for (i = 0; i < pg->n_vertices; ++i)
2431 : {
2432 56756 : data = (struct pg_vdata *)pg->vertices[i].data;
2433 56756 : if (data->partition)
2434 49141 : partitions->safe_push (data->partition);
2435 : }
2436 12262 : }
2437 :
/* Fuse partitions in PARTITIONS that belong to the same strongly
   connected component (i.e. dependence cycle) of the partition dependence
   graph built from RDG, then sort the remaining partitions in descending
   post order.  IGNORE_ALIAS_P is passed through to build_partition_graph.  */

void
loop_distribution::merge_dep_scc_partitions (struct graph *rdg,
					     vec<struct partition *> *partitions,
					     bool ignore_alias_p)
{
  struct partition *partition1, *partition2;
  struct pg_vdata *data;
  graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
  int i, j, num_sccs = graphds_scc (pg, NULL);

  /* Strongly connected component means dependence cycle, we cannot distribute
     them.  So fuse them together.  */
  if ((unsigned) num_sccs < partitions->length ())
    {
      for (i = 0; i < num_sccs; ++i)
	{
	  /* Find the first partition of component I...  */
	  for (j = 0; partitions->iterate (j, &partition1); ++j)
	    if (pg->vertices[j].component == i)
	      break;
	  /* ...and fuse every later partition of the same component
	     into it.  */
	  for (j = j + 1; partitions->iterate (j, &partition2); ++j)
	    if (pg->vertices[j].component == i)
	      {
		partition_merge_into (NULL, partition1,
				      partition2, FUSE_SAME_SCC);
		partition1->type = PTYPE_SEQUENTIAL;
		(*partitions)[j] = NULL;
		partition_free (partition2);
		data = (struct pg_vdata *)pg->vertices[j].data;
		data->partition = NULL;
	      }
	}
    }

  sort_partitions_by_post_order (pg, partitions);
  gcc_assert (partitions->length () == (unsigned)num_sccs);
  free_partition_graph_vdata (pg);
  for_each_edge (pg, free_partition_graph_edata_cb, NULL);
  free_graph (pg);
}
2477 :
/* Callback function for traversing edge E in graph G.  DATA is private
   callback data (a pg_edge_callback_data).  Collect into DATA->alias_ddrs
   the dependence relations of alias edges whose removal breaks a strongly
   connected component that is not going to be merged.  */

static void
pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
{
  int i, j, component;
  struct pg_edge_callback_data *cbdata;
  struct pg_edata *edata = (struct pg_edata *) e->data;

  /* If the edge doesn't have attached data dependence, it represents
     compilation time known dependences.  This type dependence cannot
     be resolved by runtime alias check.  */
  if (edata == NULL || edata->alias_ddrs.length () == 0)
    return;

  cbdata = (struct pg_edge_callback_data *) data;
  i = e->src;
  j = e->dest;
  component = cbdata->vertices_component[i];
  /* Vertices are topologically sorted according to compilation time
     known dependences, so we can break strongly connected components
     by removing edges of the opposite direction, i.e, edges pointing
     from vertex with smaller post number to vertex with bigger post
     number.  */
  if (g->vertices[i].post < g->vertices[j].post
      /* We only need to remove edges connecting vertices in the same
	 strongly connected component to break it.  */
      && component == cbdata->vertices_component[j]
      /* Check if we want to break the strongly connected component or not.  */
      && !bitmap_bit_p (cbdata->sccs_to_merge, component))
    cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
}
2511 :
/* Callback function for traversing edge E.  DATA is private callback
   data (a pg_edge_callback_data).  Release the alias dependence
   relations of edges lying inside a strongly connected component that
   is going to be merged, so those edges are no longer treated as
   resolvable by runtime alias checks.  */

static void
pg_unmark_merged_alias_ddrs (struct graph *, struct graph_edge *e, void *data)
{
  int i, j, component;
  struct pg_edge_callback_data *cbdata;
  struct pg_edata *edata = (struct pg_edata *) e->data;

  /* Edges without alias ddrs represent compile-time known dependences
     and need no unmarking.  */
  if (edata == NULL || edata->alias_ddrs.length () == 0)
    return;

  cbdata = (struct pg_edge_callback_data *) data;
  i = e->src;
  j = e->dest;
  component = cbdata->vertices_component[i];
  /* Make sure to not skip vertices inside SCCs we are going to merge.  */
  if (component == cbdata->vertices_component[j]
      && bitmap_bit_p (cbdata->sccs_to_merge, component))
    {
      edata->alias_ddrs.release ();
      delete edata;
      e->data = NULL;
    }
}
2538 :
/* This is the main function breaking strongly connected components in
   PARTITIONS giving reduced dependence graph RDG.  Store data dependence
   relations for runtime alias check in ALIAS_DDRS.  */
2542 : void
2543 5579 : loop_distribution::break_alias_scc_partitions (struct graph *rdg,
2544 : vec<struct partition *> *partitions,
2545 : vec<ddr_p> *alias_ddrs)
2546 : {
2547 5579 : int i, j, k, num_sccs, num_sccs_no_alias = 0;
2548 : /* Build partition dependence graph. */
2549 5579 : graph *pg = build_partition_graph (rdg, partitions, false);
2550 :
2551 5579 : alias_ddrs->truncate (0);
2552 : /* Find strong connected components in the graph, with all dependence edges
2553 : considered. */
2554 5579 : num_sccs = graphds_scc (pg, NULL);
2555 : /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2556 : compilation time known dependences are merged before this function. */
2557 5579 : if ((unsigned) num_sccs < partitions->length ())
2558 : {
2559 293 : struct pg_edge_callback_data cbdata;
2560 293 : auto_bitmap sccs_to_merge;
2561 293 : auto_vec<enum partition_type> scc_types;
2562 293 : struct partition *partition, *first;
2563 :
2564 : /* If all partitions in a SCC have the same type, we can simply merge the
2565 : SCC. This loop finds out such SCCS and record them in bitmap. */
2566 293 : bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2567 603 : for (i = 0; i < num_sccs; ++i)
2568 : {
2569 333 : for (j = 0; partitions->iterate (j, &first); ++j)
2570 333 : if (pg->vertices[j].component == i)
2571 : break;
2572 :
2573 310 : bool same_type = true, all_builtins = partition_builtin_p (first);
2574 1541 : for (++j; partitions->iterate (j, &partition); ++j)
2575 : {
2576 1268 : if (pg->vertices[j].component != i)
2577 154 : continue;
2578 :
2579 1114 : if (first->type != partition->type)
2580 : {
2581 : same_type = false;
2582 : break;
2583 : }
2584 1077 : all_builtins &= partition_builtin_p (partition);
2585 : }
2586 : /* Merge SCC if all partitions in SCC have the same type, though the
2587 : result partition is sequential, because vectorizer can do better
2588 : runtime alias check. One expecption is all partitions in SCC are
2589 : builtins. */
2590 310 : if (!same_type || all_builtins)
2591 79 : bitmap_clear_bit (sccs_to_merge, i);
2592 : }
2593 :
2594 : /* Initialize callback data for traversing. */
2595 293 : cbdata.sccs_to_merge = sccs_to_merge;
2596 293 : cbdata.alias_ddrs = alias_ddrs;
2597 293 : cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2598 : /* Record the component information which will be corrupted by next
2599 : graph scc finding call. */
2600 1721 : for (i = 0; i < pg->n_vertices; ++i)
2601 1428 : cbdata.vertices_component[i] = pg->vertices[i].component;
2602 :
2603 : /* Collect data dependences for runtime alias checks to break SCCs. */
2604 293 : if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2605 : {
2606 : /* For SCCs we want to merge clear all alias_ddrs for edges
2607 : inside the component. */
2608 75 : for_each_edge (pg, pg_unmark_merged_alias_ddrs, &cbdata);
2609 :
2610 : /* Run SCC finding algorithm again, with alias dependence edges
2611 : skipped. This is to topologically sort partitions according to
2612 : compilation time known dependence. Note the topological order
2613 : is stored in the form of pg's post order number. */
2614 75 : num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2615 : /* We cannot assert partitions->length () == num_sccs_no_alias
2616 : since we are not ignoring alias edges in cycles we are
2617 : going to merge. That's required to compute correct postorder. */
2618 : /* With topological order, we can construct two subgraphs L and R.
2619 : L contains edge <x, y> where x < y in terms of post order, while
2620 : R contains edge <x, y> where x > y. Edges for compilation time
2621 : known dependence all fall in R, so we break SCCs by removing all
2622 : (alias) edges of in subgraph L. */
2623 75 : for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2624 : }
2625 :
2626 : /* For SCC that doesn't need to be broken, merge it. */
2627 603 : for (i = 0; i < num_sccs; ++i)
2628 : {
2629 310 : if (!bitmap_bit_p (sccs_to_merge, i))
2630 79 : continue;
2631 :
2632 248 : for (j = 0; partitions->iterate (j, &first); ++j)
2633 248 : if (cbdata.vertices_component[j] == i)
2634 : break;
2635 1635 : for (k = j + 1; partitions->iterate (k, &partition); ++k)
2636 : {
2637 1094 : struct pg_vdata *data;
2638 :
2639 1094 : if (cbdata.vertices_component[k] != i)
2640 146 : continue;
2641 :
2642 948 : partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2643 948 : (*partitions)[k] = NULL;
2644 948 : partition_free (partition);
2645 948 : data = (struct pg_vdata *)pg->vertices[k].data;
2646 948 : gcc_assert (data->id == k);
2647 948 : data->partition = NULL;
2648 : /* The result partition of merged SCC must be sequential. */
2649 948 : first->type = PTYPE_SEQUENTIAL;
2650 : }
2651 : }
2652 : /* If reduction partition's SCC is broken by runtime alias checks,
2653 : we force a negative post order to it making sure it will be scheduled
2654 : in the last. */
2655 293 : if (num_sccs_no_alias > 0)
2656 : {
2657 : j = -1;
2658 327 : for (i = 0; i < pg->n_vertices; ++i)
2659 : {
2660 252 : struct pg_vdata *data = (struct pg_vdata *)pg->vertices[i].data;
2661 252 : if (data->partition && partition_reduction_p (data->partition))
2662 : {
2663 5 : gcc_assert (j == -1);
2664 : j = i;
2665 : }
2666 : }
2667 75 : if (j >= 0)
2668 5 : pg->vertices[j].post = -1;
2669 : }
2670 :
2671 293 : free (cbdata.vertices_component);
2672 293 : }
2673 :
2674 5579 : sort_partitions_by_post_order (pg, partitions);
2675 5579 : free_partition_graph_vdata (pg);
2676 5579 : for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2677 5579 : free_graph (pg);
2678 :
2679 5579 : if (dump_file && (dump_flags & TDF_DETAILS))
2680 : {
2681 15 : fprintf (dump_file, "Possible alias data dependence to break:\n");
2682 15 : dump_data_dependence_relations (dump_file, *alias_ddrs);
2683 : }
2684 5579 : }
2685 :
2686 : /* Compute and return an expression whose value is the segment length which
2687 : will be accessed by DR in NITERS iterations. */
2688 :
2689 : static tree
2690 1286 : data_ref_segment_size (struct data_reference *dr, tree niters)
2691 : {
      : /* The segment length is (NITERS - 1) * STEP: the byte distance between
      : the first and the last access of DR. The size of the access itself
      : is accounted for separately by the caller (see access_size_a/b in
      : compute_alias_check_pairs). */
2692 1286 : niters = size_binop (MINUS_EXPR,
2693 : fold_convert (sizetype, niters),
2694 : size_one_node);
2695 1286 : return size_binop (MULT_EXPR,
2696 : fold_convert (sizetype, DR_STEP (dr)),
2697 : fold_convert (sizetype, niters));
2698 : }
2699 :
2700 : /* Return true if LOOP's latch is dominated by statement for data reference
2701 : DR. */
2702 :
2703 : static inline bool
2704 1286 : latch_dominated_by_data_ref (class loop *loop, data_reference *dr)
2705 : {
      : /* True iff the block of DR's statement dominates the source block of
      : LOOP's single exit, i.e. DR is executed in every iteration that
      : reaches the latch. NOTE: this dereferences single_exit (loop), so
      : LOOP must have exactly one exit here. */
2706 1286 : return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2707 1286 : gimple_bb (DR_STMT (dr)));
2708 : }
2709 :
2710 : /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2711 : data dependence relations ALIAS_DDRS. */
2712 :
2713 : static void
2714 75 : compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs,
2715 : vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2716 : {
2717 75 : unsigned int i;
2718 75 : unsigned HOST_WIDE_INT factor = 1;
2719 75 : tree niters_plus_one, niters = number_of_latch_executions (loop);
2720 :
      : /* Callers only version loops with a computable iteration count. */
2721 75 : gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2722 75 : niters = fold_convert (sizetype, niters);
2723 75 : niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2724 :
2725 75 : if (dump_file && (dump_flags & TDF_DETAILS))
2726 0 : fprintf (dump_file, "Creating alias check pairs:\n");
2727 :
2728 : /* Iterate all data dependence relations and compute alias check pairs. */
2729 718 : for (i = 0; i < alias_ddrs->length (); i++)
2730 : {
2731 643 : ddr_p ddr = (*alias_ddrs)[i];
2732 643 : struct data_reference *dr_a = DDR_A (ddr);
2733 643 : struct data_reference *dr_b = DDR_B (ddr);
2734 643 : tree seg_length_a, seg_length_b;
2735 :
      : /* A reference dominating the exit executes latch-executions + 1
      : times, so its segment must be sized with NITERS_PLUS_ONE;
      : otherwise NITERS is a safe bound. */
2736 643 : if (latch_dominated_by_data_ref (loop, dr_a))
2737 639 : seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2738 : else
2739 4 : seg_length_a = data_ref_segment_size (dr_a, niters);
2740 :
2741 643 : if (latch_dominated_by_data_ref (loop, dr_b))
2742 639 : seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2743 : else
2744 4 : seg_length_b = data_ref_segment_size (dr_b, niters);
2745 :
2746 643 : unsigned HOST_WIDE_INT access_size_a
2747 643 : = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2748 643 : unsigned HOST_WIDE_INT access_size_b
2749 643 : = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2750 643 : unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2751 643 : unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2752 :
2753 643 : dr_with_seg_len_pair_t dr_with_seg_len_pair
2754 1286 : (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2755 1286 : dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b),
2756 : /* ??? Would WELL_ORDERED be safe? */
2757 643 : dr_with_seg_len_pair_t::REORDERED);
2758 :
2759 643 : comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2760 : }
2761 :
      : /* When the iteration count is a compile-time constant, pass it as a
      : scaling factor so pruning can fold segment lengths to constants. */
2762 75 : if (tree_fits_uhwi_p (niters))
2763 46 : factor = tree_to_uhwi (niters);
2764 :
2765 : /* Prune alias check pairs. */
2766 75 : prune_runtime_alias_test_list (comp_alias_pairs, factor);
2767 75 : if (dump_file && (dump_flags & TDF_DETAILS))
2768 0 : fprintf (dump_file,
2769 : "Improved number of alias checks from %d to %d\n",
2770 : alias_ddrs->length (), comp_alias_pairs->length ());
2771 75 : }
2772 :
2773 : /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2774 : checks and version LOOP under condition of these runtime alias checks. */
2775 :
2776 : static void
2777 75 : version_loop_by_alias_check (vec<struct partition *> *partitions,
2778 : class loop *loop, vec<ddr_p> *alias_ddrs)
2779 : {
2780 75 : profile_probability prob;
2781 75 : basic_block cond_bb;
2782 75 : class loop *nloop;
2783 75 : tree lhs, arg0, cond_expr = NULL_TREE;
2784 75 : gimple_seq cond_stmts = NULL;
2785 75 : gimple *call_stmt = NULL;
2786 75 : auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2787 :
2788 : /* Generate code for runtime alias checks if necessary. */
2789 75 : gcc_assert (alias_ddrs->length () > 0);
2790 :
2791 75 : if (dump_file && (dump_flags & TDF_DETAILS))
2792 0 : fprintf (dump_file,
2793 : "Version loop <%d> with runtime alias check\n", loop->num);
2794 :
      : /* Build the runtime no-alias condition and gimplify it into
      : COND_STMTS, leaving COND_EXPR as a gimple value. */
2795 75 : compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2796 75 : create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2797 75 : cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2798 : is_gimple_val, NULL_TREE);
2799 :
2800 : /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2801 75 : bool cancelable_p = flag_tree_loop_vectorize;
2802 75 : if (cancelable_p)
2803 : {
2804 : unsigned i = 0;
2805 : struct partition *partition;
2806 216 : for (; partitions->iterate (i, &partition); ++i)
2807 182 : if (!partition_builtin_p (partition))
2808 : break;
2809 :
2810 : /* If all partitions are builtins, distributing it would be profitable and
2811 : we don't want to cancel the runtime alias checks. */
2812 142 : if (i == partitions->length ())
2813 : cancelable_p = false;
2814 : }
2815 :
2816 : /* Generate internal function call for loop distribution alias check if the
2817 : runtime alias check should be cancelable. */
2818 41 : if (cancelable_p)
2819 : {
2820 37 : call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2821 : 2, NULL_TREE, cond_expr);
2822 37 : lhs = make_ssa_name (boolean_type_node);
2823 37 : gimple_call_set_lhs (call_stmt, lhs);
2824 : }
2825 : else
2826 : lhs = cond_expr;
2827 :
      : /* Guess the alias check passes 90% of the time. */
2828 75 : prob = profile_probability::guessed_always ().apply_scale (9, 10);
2829 75 : initialize_original_copy_tables ();
2830 75 : nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2831 : prob, prob.invert (), true);
2832 75 : free_original_copy_tables ();
2833 : /* Record the original loop number in newly generated loops. In case of
2834 : distribution, the original loop will be distributed and the new loop
2835 : is kept. */
2836 75 : loop->orig_loop_num = nloop->num;
2837 75 : nloop->orig_loop_num = nloop->num;
      : /* The fallback copy is never vectorized. */
2838 75 : nloop->dont_vectorize = true;
2839 75 : nloop->force_vectorize = false;
2840 :
2841 75 : if (call_stmt)
2842 : {
2843 : /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2844 : loop could be destroyed. */
2845 37 : arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2846 37 : gimple_call_set_arg (call_stmt, 0, arg0);
2847 37 : gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2848 : }
2849 :
2850 75 : if (cond_stmts)
2851 : {
2852 75 : gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2853 75 : gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2854 : }
2855 75 : update_ssa (TODO_update_ssa_no_phi);
2856 75 : }
2857 :
2858 : /* Return true if loop versioning is needed to distribute PARTITIONS.
2859 : ALIAS_DDRS are data dependence relations for runtime alias check. */
2860 :
2861 : static inline bool
2862 12018 : version_for_distribution_p (vec<struct partition *> *partitions,
2863 : vec<ddr_p> *alias_ddrs)
2864 : {
2865 : /* No need to version loop if we have only one partition. */
2866 14596 : if (partitions->length () == 1)
2867 : return false;
2868 :
2869 : /* Need to version loop if runtime alias check is necessary, i.e. if
2870 : any possibly-aliasing dependences were collected; versioning itself
2871 : is done by version_loop_by_alias_check. */
2870 2578 : return (alias_ddrs->length () > 0);
2871 : }
2872 :
2873 : /* Compare base offset of builtin mem* partitions P1 and P2. */
2874 :
2875 : static int
2876 375 : offset_cmp (const void *vp1, const void *vp2)
2877 : {
2878 375 : struct partition *p1 = *(struct partition *const *) vp1;
2879 375 : struct partition *p2 = *(struct partition *const *) vp2;
2880 375 : unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
2881 375 : unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
      : /* qsort-style three-way compare; (o2 < o1) - (o1 < o2) yields -1/0/1
      : without the wraparound a plain unsigned subtraction would risk. */
2882 375 : return (o2 < o1) - (o1 < o2);
2883 : }
2884 :
2885 : /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special
2886 : case optimization transforming below code:
2887 :
2888 : __builtin_memset (&obj, 0, 100);
2889 : _1 = &obj + 100;
2890 : __builtin_memset (_1, 0, 200);
2891 : _2 = &obj + 300;
2892 : __builtin_memset (_2, 0, 100);
2893 :
2894 : into:
2895 :
2896 : __builtin_memset (&obj, 0, 400);
2897 :
2898 : Note we don't have dependence information between different partitions
2899 : at this point, as a result, we can't handle nonadjacent memset builtin
2900 : partitions since dependence might be broken. */
2901 :
2902 : static void
2903 2579 : fuse_memset_builtins (vec<struct partition *> *partitions)
2904 : {
2905 2579 : unsigned i, j;
2906 2579 : struct partition *part1, *part2;
2907 2579 : tree rhs1, rhs2;
2908 :
      : /* Pass 1: sort each maximal run of memset partitions with the same
      : base (and same stored value) by destination offset. */
2909 8160 : for (i = 0; partitions->iterate (i, &part1);)
2910 : {
2911 5581 : if (part1->kind != PKIND_MEMSET)
2912 : {
2913 3414 : i++;
2914 3414 : continue;
2915 : }
2916 :
2917 : /* Find sub-array of memset builtins of the same base. Index range
2918 : of the sub-array is [i, j) with "j > i". */
2919 2241 : for (j = i + 1; partitions->iterate (j, &part2); ++j)
2920 : {
2921 1714 : if (part2->kind != PKIND_MEMSET
2922 2274 : || !operand_equal_p (part1->builtin->dst_base_base,
2923 560 : part2->builtin->dst_base_base, 0))
2924 : break;
2925 :
2926 : /* Memset calls setting different values can't be merged. */
2927 122 : rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2928 122 : rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2929 122 : if (!operand_equal_p (rhs1, rhs2, 0))
2930 : break;
2931 : }
2932 :
2933 : /* Stable sort is required in order to avoid breaking dependence. */
2934 2167 : gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
2935 : offset_cmp);
2936 : /* Continue with next partition. */
2937 2167 : i = j;
2938 : }
2939 :
2940 : /* Pass 2: merge all consecutive memset builtin partitions. NB:
2941 : length () - 1 is computed in unsigned arithmetic; the caller
2942 : (finalize_partitions) only invokes this with at least two
2943 : partitions, so it cannot wrap here. */
2941 11310 : for (i = 0; i < partitions->length () - 1;)
2942 : {
2943 3076 : part1 = (*partitions)[i];
2944 3076 : if (part1->kind != PKIND_MEMSET)
2945 : {
2946 1362 : i++;
2947 3048 : continue;
2948 : }
2949 :
2950 1714 : part2 = (*partitions)[i + 1];
2951 : /* Only merge memset partitions of the same base and with constant
2952 : access sizes. */
2953 3314 : if (part2->kind != PKIND_MEMSET
2954 560 : || TREE_CODE (part1->builtin->size) != INTEGER_CST
2955 295 : || TREE_CODE (part2->builtin->size) != INTEGER_CST
2956 2009 : || !operand_equal_p (part1->builtin->dst_base_base,
2957 295 : part2->builtin->dst_base_base, 0))
2958 : {
2959 1600 : i++;
2960 1600 : continue;
2961 : }
2962 114 : rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2963 114 : rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2964 114 : int bytev1 = const_with_all_bytes_same (rhs1);
2965 114 : int bytev2 = const_with_all_bytes_same (rhs2);
2966 : /* Only merge memset partitions of the same value. */
2967 114 : if (bytev1 != bytev2 || bytev1 == -1)
2968 : {
2969 44 : i++;
2970 44 : continue;
2971 : }
2972 140 : wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2973 70 : wi::to_wide (part1->builtin->size));
2974 : /* Only merge adjacent memset partitions. */
2975 70 : if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2976 : {
2977 42 : i++;
2978 42 : continue;
2979 : }
2980 : /* Merge partitions[i] and partitions[i+1]. Note I is deliberately
2981 : not advanced so the grown partition can absorb further neighbors. */
2981 28 : part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2982 : part1->builtin->size,
2983 : part2->builtin->size);
2984 28 : partition_free (part2);
2985 28 : partitions->ordered_remove (i + 1);
2986 70 : }
2987 2579 : }
2988 :
      : /* Finalize PARTITIONS of LOOP before code generation: under a simple
      : cost model, fuse everything back into a single partition when
      : distribution is unlikely to pay off, then try to fuse adjacent
      : memset builtin partitions. ALIAS_DDRS holds dependences for the
      : runtime alias check; nothing is fused when such checks exist. */
2989 : void
2990 39274 : loop_distribution::finalize_partitions (class loop *loop,
2991 : vec<struct partition *> *partitions,
2992 : vec<ddr_p> *alias_ddrs)
2993 : {
2994 39274 : unsigned i;
2995 39274 : struct partition *partition, *a;
2996 :
2997 44874 : if (partitions->length () == 1
2998 39349 : || alias_ddrs->length () > 0)
2999 39274 : return;
3000 :
3001 5525 : unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
3002 5525 : bool same_type_p = true;
3003 5525 : enum partition_type type = ((*partitions)[0])->type;
      : /* Count builtin vs normal partitions and whether all share one type. */
3004 29466 : for (i = 0; partitions->iterate (i, &partition); ++i)
3005 : {
3006 23941 : same_type_p &= (type == partition->type);
3007 23941 : if (partition_builtin_p (partition))
3008 : {
3009 2609 : num_builtin++;
3010 2609 : continue;
3011 : }
3012 21332 : num_normal++;
3013 21332 : if (partition->kind == PKIND_PARTIAL_MEMSET)
3014 4 : num_partial_memset++;
3015 : }
3016 :
3017 : /* Don't distribute current loop into too many loops given we don't have
3018 : memory stream cost model. Be even more conservative in case of loop
3019 : nest distribution. */
3020 5525 : if ((same_type_p && num_builtin == 0
3021 2923 : && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
3022 2606 : || (loop->inner != NULL
3023 65 : && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
3024 2597 : || (loop->inner == NULL
3025 2541 : && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
3026 : {
3027 : a = (*partitions)[0];
3028 18286 : for (i = 1; partitions->iterate (i, &partition); ++i)
3029 : {
3030 15340 : partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
3031 15340 : partition_free (partition);
3032 : }
3033 2946 : partitions->truncate (1);
3034 : }
3035 :
3036 : /* Fuse memset builtins if possible. */
3037 5525 : if (partitions->length () > 1)
3038 2579 : fuse_memset_builtins (partitions);
3039 : }
3040 :
3041 : /* Distributes the code from LOOP in such a way that producer statements
3042 : are placed before consumer statements. Tries to separate only the
3043 : statements from STMTS into separate loops. Returns the number of
3044 : distributed loops. Set NB_CALLS to number of generated builtin calls.
3045 : Set *DESTROY_P to whether LOOP needs to be destroyed. */
3046 :
3047 : int
3048 139522 : loop_distribution::distribute_loop (class loop *loop,
3049 : const vec<gimple *> &stmts,
3050 : control_dependences *cd, int *nb_calls, bool *destroy_p,
3051 : bool only_patterns_p)
3052 : {
3053 139522 : ddrs_table = new hash_table<ddr_hasher> (389);
3054 139522 : struct graph *rdg;
3055 139522 : partition *partition;
3056 139522 : int i, nbp;
3057 :
3058 139522 : *destroy_p = false;
3059 139522 : *nb_calls = 0;
3060 139522 : loop_nest.create (0);
3061 139522 : if (!find_loop_nest (loop, &loop_nest))
3062 : {
3063 0 : loop_nest.release ();
3064 0 : delete ddrs_table;
3065 0 : return 0;
3066 : }
3067 :
3068 139522 : datarefs_vec.create (20);
3069 139522 : has_nonaddressable_dataref_p = false;
3070 139522 : rdg = build_rdg (loop, cd);
3071 139522 : if (!rdg)
3072 : {
3073 2491 : if (dump_file && (dump_flags & TDF_DETAILS))
3074 0 : fprintf (dump_file,
3075 : "Loop %d not distributed: failed to build the RDG.\n",
3076 : loop->num);
3077 :
3078 2491 : loop_nest.release ();
3079 2491 : free_data_refs (datarefs_vec);
3080 2491 : delete ddrs_table;
3081 2491 : return 0;
3082 : }
3083 :
3084 274062 : if (datarefs_vec.length () > MAX_DATAREFS_NUM)
3085 : {
3086 0 : if (dump_file && (dump_flags & TDF_DETAILS))
3087 0 : fprintf (dump_file,
3088 : "Loop %d not distributed: too many memory references.\n",
3089 : loop->num);
3090 :
3091 0 : free_rdg (rdg, loop);
3092 0 : loop_nest.release ();
3093 0 : free_data_refs (datarefs_vec);
3094 0 : delete ddrs_table;
3095 0 : return 0;
3096 : }
3097 :
      : /* Give each data reference its index as AUX so it can be identified
      : cheaply later. */
3098 : data_reference_p dref;
3099 565613 : for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
3100 428582 : dref->aux = (void *) (uintptr_t) i;
3101 :
3102 137031 : if (dump_file && (dump_flags & TDF_DETAILS))
3103 70 : dump_rdg (dump_file, rdg);
3104 :
3105 137031 : auto_vec<struct partition *, 3> partitions;
3106 137031 : rdg_build_partitions (rdg, stmts, &partitions);
3107 :
3108 137031 : auto_vec<ddr_p> alias_ddrs;
3109 :
      : /* Compute the intersection of all partitions' statement bitmaps;
      : it is fed to classify_partition below. */
3110 137031 : auto_bitmap stmt_in_all_partitions;
3111 137031 : bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
3112 383287 : for (i = 1; partitions.iterate (i, &partition); ++i)
3113 109225 : bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
3114 :
3115 : bool any_builtin = false;
3116 : bool reduction_in_all = false;
3117 383287 : int reduction_partition_num = -1;
3118 383287 : FOR_EACH_VEC_ELT (partitions, i, partition)
3119 : {
3120 246256 : reduction_in_all
3121 246256 : |= classify_partition (loop, rdg, partition, stmt_in_all_partitions);
3122 246256 : any_builtin |= partition_builtin_p (partition);
3123 : }
3124 :
3125 : /* If we are only distributing patterns but did not detect any,
3126 : simply bail out. */
3127 137031 : if (only_patterns_p
3128 137031 : && !any_builtin)
3129 : {
3130 97757 : nbp = 0;
3131 97757 : goto ldist_done;
3132 : }
3133 :
3134 : /* If we are only distributing patterns fuse all partitions that
3135 : were not classified as builtins. This also avoids chopping
3136 : a loop into pieces, separated by builtin calls. That is, we
3137 : only want no or a single loop body remaining. */
3138 39274 : struct partition *into;
3139 39274 : if (only_patterns_p)
3140 : {
3141 17900 : for (i = 0; partitions.iterate (i, &into); ++i)
3142 10645 : if (!partition_builtin_p (into))
3143 : break;
3144 11397 : for (++i; partitions.iterate (i, &partition); ++i)
3145 2619 : if (!partition_builtin_p (partition))
3146 : {
3147 1995 : partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
3148 1995 : partitions.unordered_remove (i);
3149 1995 : partition_free (partition);
3150 1995 : i--;
3151 : }
3152 : }
3153 :
3154 : /* Due to limitations in the transform phase we have to fuse all
3155 : reduction partitions into the last partition so the existing
3156 : loop will contain all loop-closed PHI nodes. */
3157 109660 : for (i = 0; partitions.iterate (i, &into); ++i)
3158 72417 : if (partition_reduction_p (into))
3159 : break;
3160 41797 : for (i = i + 1; partitions.iterate (i, &partition); ++i)
3161 2523 : if (partition_reduction_p (partition))
3162 : {
3163 2296 : partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
3164 2296 : partitions.unordered_remove (i);
3165 2296 : partition_free (partition);
3166 2296 : i--;
3167 : }
3168 :
3169 : /* Apply our simple cost model - fuse partitions with similar
3170 : memory accesses. */
3171 108380 : for (i = 0; partitions.iterate (i, &into); ++i)
3172 : {
3173 69106 : bool changed = false;
3174 935891 : for (int j = i + 1; partitions.iterate (j, &partition); ++j)
3175 : {
3176 866785 : if (share_memory_accesses (rdg, into, partition))
3177 : {
3178 7163 : partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
3179 7163 : partitions.unordered_remove (j);
3180 7163 : partition_free (partition);
3181 7163 : j--;
3182 7163 : changed = true;
3183 : }
3184 : }
3185 : /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar
3186 : accesses when 1 and 2 have similar accesses but not 0 and 1
3187 : then in the next iteration we will fail to consider merging
3188 : 1 into 0,2. So try again if we did any merging into 0. */
3189 69106 : if (changed)
3190 3625 : i--;
3191 : }
3192 :
3193 : /* Put a non-builtin partition last if we need to preserve a reduction.
3194 : In most cases this helps to keep a normal partition last avoiding to
3195 : spill a reduction result across builtin calls.
3196 : ??? The proper way would be to use dependences to see whether we
3197 : can move builtin partitions earlier during merge_dep_scc_partitions
3198 : and its sort_partitions_by_post_order. Especially when the
3199 : dependence graph is composed of multiple independent subgraphs the
3200 : heuristic does not work reliably. */
3201 39274 : if (reduction_in_all
3202 46374 : && partition_builtin_p (partitions.last()))
3203 136 : FOR_EACH_VEC_ELT (partitions, i, partition)
3204 76 : if (!partition_builtin_p (partition))
3205 : {
3206 15 : partitions.unordered_remove (i);
3207 15 : partitions.quick_push (partition);
3208 15 : break;
3209 : }
3210 :
3211 : /* Build the partition dependency graph and fuse partitions in strong
3212 : connected component. */
3213 39274 : if (partitions.length () > 1)
3214 : {
3215 : /* Don't support loop nest distribution under runtime alias check
3216 : since it's not likely to enable many vectorization opportunities.
3217 : Also if loop has any data reference which may be not addressable
3218 : since alias check needs to take, compare address of the object. */
3219 6683 : if (loop->inner || has_nonaddressable_dataref_p)
3220 354 : merge_dep_scc_partitions (rdg, &partitions, false);
3221 : else
3222 : {
3223 6329 : merge_dep_scc_partitions (rdg, &partitions, true);
3224 6329 : if (partitions.length () > 1)
3225 5579 : break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
3226 : }
3227 : }
3228 :
3229 39274 : finalize_partitions (loop, &partitions, &alias_ddrs);
3230 :
3231 : /* If there is a reduction in all partitions make sure the last
3232 : non-builtin partition provides the LC PHI defs. */
3233 39274 : if (reduction_in_all)
3234 : {
3235 14252 : FOR_EACH_VEC_ELT (partitions, i, partition)
3236 7152 : if (!partition_builtin_p (partition))
3237 7076 : reduction_partition_num = i;
3238 7100 : if (reduction_partition_num == -1)
3239 : {
3240 : /* If all partitions are builtin, force the last one to
3241 : be code generated as normal partition. */
3242 60 : partition = partitions.last ();
3243 60 : partition->kind = PKIND_NORMAL;
3244 : }
3245 : }
3246 :
      : /* Bail out if distribution degenerated: no partitions, a single
      : non-builtin partition, or several partitions still containing all
      : reads and writes. */
3247 39274 : nbp = partitions.length ();
3248 39274 : if (nbp == 0
3249 75902 : || (nbp == 1 && !partition_builtin_p (partitions[0]))
3250 51360 : || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
3251 : {
3252 27256 : nbp = 0;
3253 27256 : goto ldist_done;
3254 : }
3255 :
3256 12093 : if (version_for_distribution_p (&partitions, &alias_ddrs))
3257 75 : version_loop_by_alias_check (&partitions, loop, &alias_ddrs);
3258 :
3259 12018 : if (dump_file && (dump_flags & TDF_DETAILS))
3260 : {
3261 42 : fprintf (dump_file,
3262 : "distribute loop <%d> into partitions:\n", loop->num);
3263 42 : dump_rdg_partitions (dump_file, partitions);
3264 : }
3265 :
3266 27192 : FOR_EACH_VEC_ELT (partitions, i, partition)
3267 : {
3268 15174 : if (partition_builtin_p (partition))
3269 12149 : (*nb_calls)++;
3270 15174 : *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1,
3271 : i == reduction_partition_num);
3272 : }
3273 :
      : /* Common cleanup path: release per-loop data (loop nest, data refs,
      : the DDR hash table, remaining partitions and the RDG). */
3274 137031 : ldist_done:
3275 137031 : loop_nest.release ();
3276 137031 : free_data_refs (datarefs_vec);
3277 137031 : for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
3278 2486115 : iter != ddrs_table->end (); ++iter)
3279 : {
3280 1174542 : free_dependence_relation (*iter);
3281 1174542 : *iter = NULL;
3282 : }
3283 137031 : delete ddrs_table;
3284 :
3285 348850 : FOR_EACH_VEC_ELT (partitions, i, partition)
3286 211819 : partition_free (partition);
3287 :
3288 137031 : free_rdg (rdg, loop);
3289 137031 : return nbp - *nb_calls;
3290 137031 : }
3291 :
3292 :
      : /* Compute a topological (reverse post order) numbering of the
      : function's basic blocks, stored in bb_top_order_index so blocks can
      : be compared by topological position. */
3293 201437 : void loop_distribution::bb_top_order_init (void)
3294 : {
3295 201437 : int rpo_num;
3296 201437 : int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (cfun) - NUM_FIXED_BLOCKS);
3297 201437 : edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
3298 201437 : bitmap exit_bbs = BITMAP_ALLOC (NULL);
3299 :
3300 201437 : bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3301 201437 : bb_top_order_index_size = last_basic_block_for_fn (cfun);
3302 :
3303 201437 : entry->flags &= ~EDGE_DFS_BACK;
3304 201437 : bitmap_set_bit (exit_bbs, EXIT_BLOCK);
3305 201437 : rpo_num = rev_post_order_and_mark_dfs_back_seme (cfun, entry, exit_bbs, true,
3306 : rpo, NULL);
3307 201437 : BITMAP_FREE (exit_bbs);
3308 :
      : /* Invert the RPO array: map each block index to its order number. */
3309 6910762 : for (int i = 0; i < rpo_num; i++)
3310 6709325 : bb_top_order_index[rpo[i]] = i;
3311 :
3312 201437 : free (rpo);
3313 201437 : }
3314 :
      : /* Release the mapping allocated by bb_top_order_init. */
3315 201437 : void loop_distribution::bb_top_order_destroy ()
3316 : {
3317 201437 : free (bb_top_order_index);
3318 201437 : bb_top_order_index = NULL;
3319 201437 : bb_top_order_index_size = 0;
3320 201437 : }
3321 :
3322 :
3323 : /* Given LOOP, this function records seed statements for distribution in
3324 : WORK_LIST. Return false if there is nothing for distribution. */
3325 :
3326 : static bool
3327 183897 : find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list)
3328 : {
3329 183897 : basic_block *bbs = get_loop_body_in_dom_order (loop);
3330 :
3331 : /* Initialize the worklist with stmts we seed the partitions with. */
3332 665720 : for (unsigned i = 0; i < loop->num_nodes; ++i)
3333 : {
3334 : /* In irreducible sub-regions we don't know how to redirect
3335 : conditions, so fail. See PR100492. */
3336 526047 : if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
3337 : {
3338 4 : if (dump_file && (dump_flags & TDF_DETAILS))
3339 0 : fprintf (dump_file, "loop %d contains an irreducible region.\n",
3340 : loop->num);
3341 4 : work_list->truncate (0);
3342 4 : break;
3343 : }
3344 526043 : for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
3345 1241954 : !gsi_end_p (gsi); gsi_next (&gsi))
3346 : {
3347 715911 : gphi *phi = gsi.phi ();
3348 1431822 : if (virtual_operand_p (gimple_phi_result (phi)))
3349 190998 : continue;
3350 : /* Distribute stmts which have defs that are used outside of
3351 : the loop. */
3352 524913 : if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
3353 499910 : continue;
3354 25003 : work_list->safe_push (phi);
3355 : }
3356 1052086 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3357 3567178 : !gsi_end_p (gsi); gsi_next (&gsi))
3358 : {
3359 3085355 : gimple *stmt = gsi_stmt (gsi);
3360 :
3361 : /* Ignore clobbers, they do not have true side effects. */
3362 3085355 : if (gimple_clobber_p (stmt))
3363 2773163 : continue;
3364 :
3365 : /* If there is a stmt with side-effects bail out - we
3366 : cannot and should not distribute this loop. */
3367 3076767 : if (gimple_has_side_effects (stmt))
3368 : {
3369 44220 : free (bbs);
3370 44220 : return false;
3371 : }
3372 :
3373 : /* Distribute stmts which have defs that are used outside of
3374 : the loop. */
3375 3032547 : if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3376 : ;
3377 : /* Otherwise only distribute stores for now. */
3378 4752256 : else if (!gimple_vdef (stmt))
3379 2764575 : continue;
3380 :
3381 267972 : work_list->safe_push (stmt);
3382 : }
3383 : }
3384 139677 : bool res = work_list->length () > 0;
3385 139531 : if (res && !can_copy_bbs_p (bbs, loop->num_nodes))
3386 : {
3387 8 : if (dump_file && (dump_flags & TDF_DETAILS))
3388 0 : fprintf (dump_file, "cannot copy loop %d.\n", loop->num);
3389 : res = false;
3390 : }
3391 139677 : free (bbs);
3392 139677 : return res;
3393 : }
3394 :
3395 : /* A helper function for generate_{rawmemchr,strlen}_builtin functions in order
3396 : to place new statements SEQ before LOOP and replace the old reduction
3397 : variable with the new one. */
3398 :
3399 : static void
3400 29 : generate_reduction_builtin_1 (loop_p loop, gimple_seq &seq,
3401 : tree reduction_var_old, tree reduction_var_new,
3402 : const char *info, machine_mode load_mode)
3403 : {
      : /* Only called for the -ftree-loop-distribute-patterns paths. */
3404 29 : gcc_assert (flag_tree_loop_distribute_patterns);
3405 :
3406 : /* Place new statements before LOOP. */
3407 29 : gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
3408 29 : gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING);
3409 :
3410 : /* Replace old reduction variable with new one by rewriting every
3411 : immediate use of REDUCTION_VAR_OLD. */
3411 29 : imm_use_iterator iter;
3412 29 : gimple *stmt;
3413 29 : use_operand_p use_p;
3414 144 : FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var_old)
3415 : {
3416 344 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
3417 86 : SET_USE (use_p, reduction_var_new);
3418 :
3419 86 : update_stmt (stmt);
3420 : }
3421 :
      : /* INFO is a printf format with a %s for the load mode name. */
3422 29 : if (dump_file && (dump_flags & TDF_DETAILS))
3423 7 : fprintf (dump_file, info, GET_MODE_NAME (load_mode));
3424 29 : }
3425 :
3426 : /* Generate a call to rawmemchr and place it before LOOP. REDUCTION_VAR is
3427 : replaced with a fresh SSA name representing the result of the call. */
3428 :
3429 : static void
3430 0 : generate_rawmemchr_builtin (loop_p loop, tree reduction_var,
3431 : data_reference_p store_dr, tree base, tree pattern,
3432 : location_t loc)
3433 : {
3434 0 : gimple_seq seq = NULL;
3435 :
      : /* Build IFN_RAWMEMCHR (BASE, PATTERN) and assign its result to a
      : fresh copy of REDUCTION_VAR. */
3436 0 : tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
3437 0 : gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, pattern);
3438 0 : tree reduction_var_new = copy_ssa_name (reduction_var);
3439 0 : gimple_call_set_lhs (fn_call, reduction_var_new);
3440 0 : gimple_set_location (fn_call, loc);
3441 0 : gimple_seq_add_stmt (&seq, fn_call);
3442 :
      : /* If the original loop also stored the result, emit that store. */
3443 0 : if (store_dr)
3444 : {
3445 0 : gassign *g = gimple_build_assign (DR_REF (store_dr), reduction_var_new);
3446 0 : gimple_seq_add_stmt (&seq, g);
3447 : }
3448 :
3449 0 : generate_reduction_builtin_1 (loop, seq, reduction_var, reduction_var_new,
3450 : "generated rawmemchr%s\n",
3451 0 : TYPE_MODE (TREE_TYPE (TREE_TYPE (base))));
3452 0 : }
3453 :
3454 : /* Helper function for generate_strlen_builtin(,_using_rawmemchr) */
3455 :
3456 : static void
3457 29 : generate_strlen_builtin_1 (loop_p loop, gimple_seq &seq,
3458 : tree reduction_var_old, tree reduction_var_new,
3459 : machine_mode mode, tree start_len)
3460 : {
3461 : /* REDUCTION_VAR_NEW has either size type or ptrdiff type and must be
3462 : converted if types of old and new reduction variable are not compatible. */
3463 29 : reduction_var_new = gimple_convert (&seq, TREE_TYPE (reduction_var_old),
3464 : reduction_var_new);
3465 :
3466 : /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start
3467 : length, which has to be added to the computed length. */
3468 29 : if (!integer_zerop (start_len))
3469 : {
3470 25 : tree lhs = make_ssa_name (TREE_TYPE (reduction_var_new));
3471 25 : gimple *g = gimple_build_assign (lhs, PLUS_EXPR, reduction_var_new,
3472 : start_len);
3473 25 : gimple_seq_add_stmt (&seq, g);
3474 25 : reduction_var_new = lhs;
3475 : }
3476 :
      : /* Emit SEQ before LOOP and redirect uses of the old reduction var. */
3477 29 : generate_reduction_builtin_1 (loop, seq, reduction_var_old, reduction_var_new,
3478 : "generated strlen%s\n", mode);
3479 29 : }
3480 :
3481 : /* Generate a call to strlen and place it before LOOP. REDUCTION_VAR is
3482 : replaced with a fresh SSA name representing the result of the call. */
3483 :
3484 : static void
3485 29 : generate_strlen_builtin (loop_p loop, tree reduction_var, tree base,
3486 : tree start_len, location_t loc)
3487 : {
3488 29 : gimple_seq seq = NULL;
3489 :
3490 29 : tree reduction_var_new = make_ssa_name (size_type_node);
3491 :
3492 29 : tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
3493 58 : tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_STRLEN));
3494 29 : gimple *fn_call = gimple_build_call (fn, 1, mem);
3495 29 : gimple_call_set_lhs (fn_call, reduction_var_new);
3496 29 : gimple_set_location (fn_call, loc);
3497 29 : gimple_seq_add_stmt (&seq, fn_call);
3498 :
3499 29 : generate_strlen_builtin_1 (loop, seq, reduction_var, reduction_var_new,
3500 : QImode, start_len);
3501 29 : }
3502 :
3503 : /* Generate code in order to mimic the behaviour of strlen but this time over
3504 : an array of elements with mode different than QI. REDUCTION_VAR is replaced
3505 : with a fresh SSA name representing the result, i.e., the length. */
3506 :
3507 : static void
3508 0 : generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var,
3509 : tree base, tree load_type,
3510 : tree start_len, location_t loc)
3511 : {
3512 0 : gimple_seq seq = NULL;
3513 :
3514 0 : tree start = force_gimple_operand (base, &seq, true, NULL_TREE);
3515 0 : tree zero = build_zero_cst (load_type);
3516 0 : gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, start, zero);
3517 0 : tree end = make_ssa_name (TREE_TYPE (base));
3518 0 : gimple_call_set_lhs (fn_call, end);
3519 0 : gimple_set_location (fn_call, loc);
3520 0 : gimple_seq_add_stmt (&seq, fn_call);
3521 :
3522 : /* Determine the number of elements between START and END by
3523 : evaluating (END - START) / sizeof (*START). */
3524 0 : tree diff = make_ssa_name (ptrdiff_type_node);
3525 0 : gimple *diff_stmt = gimple_build_assign (diff, POINTER_DIFF_EXPR, end, start);
3526 0 : gimple_seq_add_stmt (&seq, diff_stmt);
3527 : /* Let SIZE be the size of each character. */
3528 0 : tree size = gimple_convert (&seq, ptrdiff_type_node,
3529 0 : TYPE_SIZE_UNIT (load_type));
3530 0 : tree count = make_ssa_name (ptrdiff_type_node);
3531 0 : gimple *count_stmt = gimple_build_assign (count, TRUNC_DIV_EXPR, diff, size);
3532 0 : gimple_seq_add_stmt (&seq, count_stmt);
3533 :
3534 0 : generate_strlen_builtin_1 (loop, seq, reduction_var, count,
3535 0 : TYPE_MODE (load_type),
3536 : start_len);
3537 0 : }
3538 :
3539 : /* Return true if we can count at least as many characters by taking pointer
3540 : difference as we can count via reduction_var without an overflow. Thus
3541 : compute 2^n < (2^(m-1) / s) where n = TYPE_PRECISION (reduction_var_type),
3542 : m = TYPE_PRECISION (ptrdiff_type_node), and s = size of each character. */
3543 : static bool
3544 0 : reduction_var_overflows_first (tree reduction_var_type, tree load_type)
3545 : {
3546 0 : widest_int n2 = wi::lshift (1, TYPE_PRECISION (reduction_var_type));;
3547 0 : widest_int m2 = wi::lshift (1, TYPE_PRECISION (ptrdiff_type_node) - 1);
3548 0 : widest_int s = wi::to_widest (TYPE_SIZE_UNIT (load_type));
3549 0 : return wi::ltu_p (n2, wi::udiv_trunc (m2, s));
3550 0 : }
3551 :
/* Worker for determine_reduction_stmt.  BBS is the array of LOOP's body
   blocks.  Return the single statement of LOOP which has scalar
   dependences outside of LOOP (the reduction candidate), or NULL if
   there is none, more than one, a statement with volatile operands, or
   the loop body is too large to be worth matching.  */

static gimple *
determine_reduction_stmt_1 (const loop_p loop, const basic_block *bbs)
{
  gimple *reduction_stmt = NULL;

  /* NINSNS accumulates the non-debug statement count over all blocks so
     the early bail-out below bounds the total amount of work.  */
  for (unsigned i = 0, ninsns = 0; i < loop->num_nodes; ++i)
    {
      basic_block bb = bbs[i];

      /* Scan PHIs; virtual operands are irrelevant for a scalar
	 reduction.  */
      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	{
	  gphi *phi = bsi.phi ();
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;
	  if (stmt_has_scalar_dependences_outside_loop (loop, phi))
	    {
	      /* Only a single reduction statement is supported.  */
	      if (reduction_stmt)
		return NULL;
	      reduction_stmt = phi;
	    }
	}

      for (gimple_stmt_iterator bsi = gsi_start_nondebug_bb (bb);
	   !gsi_end_p (bsi); gsi_next_nondebug (&bsi), ++ninsns)
	{
	  /* Bail out early for loops which are unlikely to match.  */
	  if (ninsns > 16)
	    return NULL;
	  gimple *stmt = gsi_stmt (bsi);
	  /* Clobbers and labels do not affect the reduction.  */
	  if (gimple_clobber_p (stmt))
	    continue;
	  if (gimple_code (stmt) == GIMPLE_LABEL)
	    continue;
	  /* Volatile accesses must not be transformed away.  */
	  if (gimple_has_volatile_ops (stmt))
	    return NULL;
	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
	    {
	      /* Only a single reduction statement is supported.  */
	      if (reduction_stmt)
		return NULL;
	      reduction_stmt = stmt;
	    }
	}
    }

  return reduction_stmt;
}
3599 :
3600 : /* If LOOP has a single non-volatile reduction statement, then return a pointer
3601 : to it. Otherwise return NULL. */
3602 : static gimple *
3603 24286 : determine_reduction_stmt (const loop_p loop)
3604 : {
3605 24286 : basic_block *bbs = get_loop_body (loop);
3606 24286 : gimple *reduction_stmt = determine_reduction_stmt_1 (loop, bbs);
3607 24286 : XDELETEVEC (bbs);
3608 24286 : return reduction_stmt;
3609 : }
3610 :
3611 : /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
3612 : replace them accordingly. For example, a loop of the form
3613 :
3614 : for (; *p != 42; ++p);
3615 :
3616 : is replaced by
3617 :
3618 : p = rawmemchr<MODE> (p, 42);
3619 :
3620 : under the assumption that rawmemchr is available for a particular MODE.
3621 : Another example is
3622 :
3623 : int i;
3624 : for (i = 42; s[i]; ++i);
3625 :
3626 : which is replaced by
3627 :
3628 : i = (int)strlen (&s[42]) + 42;
3629 :
3630 : for some character array S. In case array S is not of type character array
3631 : we end up with
3632 :
3633 : i = (int)(rawmemchr<MODE> (&s[42], 0) - &s[42]) + 42;
3634 :
3635 : assuming that rawmemchr is available for a particular MODE. */
3636 :
bool
loop_distribution::transform_reduction_loop (loop_p loop)
{
  gimple *reduction_stmt;
  data_reference_p load_dr = NULL, store_dr = NULL;

  /* The exit test of the single-exit loop must be a GIMPLE_COND.  */
  edge e = single_exit (loop);
  gcond *cond = safe_dyn_cast <gcond *> (*gsi_last_bb (e->src));
  if (!cond)
    return false;
  /* Ensure loop condition is an (in)equality test and loop is exited either if
     the inequality test fails or the equality test succeeds.  */
  if (!(e->flags & EDGE_FALSE_VALUE && gimple_cond_code (cond) == NE_EXPR)
      && !(e->flags & EDGE_TRUE_VALUE && gimple_cond_code (cond) == EQ_EXPR))
    return false;
  /* A limitation of the current implementation is that we only support
     constant patterns in (in)equality tests.  */
  tree pattern = gimple_cond_rhs (cond);
  if (TREE_CODE (pattern) != INTEGER_CST)
    return false;

  reduction_stmt = determine_reduction_stmt (loop);

  /* A limitation of the current implementation is that we require a reduction
     statement.  Therefore, loops without a reduction statement as in the
     following are not recognized:
     int *p;
     void foo (void) { for (; *p; ++p); } */
  if (reduction_stmt == NULL)
    return false;

  /* Reduction variables are guaranteed to be SSA names.  */
  tree reduction_var;
  switch (gimple_code (reduction_stmt))
    {
    case GIMPLE_ASSIGN:
    case GIMPLE_PHI:
      reduction_var = gimple_get_lhs (reduction_stmt);
      break;
    default:
      /* Bail out e.g. for GIMPLE_CALL.  */
      return false;
    }

  struct graph *rdg = build_rdg (loop, NULL);
  if (rdg == NULL)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "Loop %d not transformed: failed to build the RDG.\n",
		 loop->num);

      return false;
    }
  /* Look for the single load (and optional single store) over a
     partition covering all statements of the loop.  */
  auto_bitmap partition_stmts;
  bitmap_set_range (partition_stmts, 0, rdg->n_vertices);
  find_single_drs (loop, rdg, partition_stmts, &store_dr, &load_dr);
  free_rdg (rdg, loop);

  /* Bail out if there is no single load.  */
  if (load_dr == NULL)
    return false;

  /* Reaching this point we have a loop with a single reduction variable,
     a single load, and an optional single store.  */

  tree load_ref = DR_REF (load_dr);
  tree load_type = TREE_TYPE (load_ref);
  tree load_access_base = build_fold_addr_expr (load_ref);
  tree load_access_size = TYPE_SIZE_UNIT (load_type);
  affine_iv load_iv, reduction_iv;

  /* Only integral loads whose type has mode precision are handled.  */
  if (!INTEGRAL_TYPE_P (load_type)
      || !type_has_mode_precision_p (load_type))
    return false;

  /* We already ensured that the loop condition tests for (in)equality where the
     rhs is a constant pattern.  Now ensure that the lhs is the result of the
     load.  */
  if (gimple_cond_lhs (cond) != gimple_assign_lhs (DR_STMT (load_dr)))
    return false;

  /* Bail out if no affine induction variable with constant step can be
     determined.  */
  if (!simple_iv (loop, loop, load_access_base, &load_iv, false))
    return false;

  /* Bail out if memory accesses are not consecutive or not growing.  */
  if (!operand_equal_p (load_iv.step, load_access_size, 0))
    return false;

  /* Likewise the reduction variable must be a simple affine IV.  */
  if (!simple_iv (loop, loop, reduction_var, &reduction_iv, false))
    return false;

  /* Handle rawmemchr like loops.  */
  if (operand_equal_p (load_iv.base, reduction_iv.base)
      && operand_equal_p (load_iv.step, reduction_iv.step))
    {
      if (store_dr)
	{
	  /* Ensure that we store to X and load from X+I where I>0.  */
	  if (TREE_CODE (load_iv.base) != POINTER_PLUS_EXPR
	      || !integer_onep (TREE_OPERAND (load_iv.base, 1)))
	    return false;
	  tree ptr_base = TREE_OPERAND (load_iv.base, 0);
	  if (TREE_CODE (ptr_base) != SSA_NAME)
	    return false;
	  gimple *def = SSA_NAME_DEF_STMT (ptr_base);
	  if (!gimple_assign_single_p (def)
	      || gimple_assign_rhs1 (def) != DR_REF (store_dr))
	    return false;
	  /* Ensure that the reduction value is stored.  */
	  if (gimple_assign_rhs1 (DR_STMT (store_dr)) != reduction_var)
	    return false;
	}
      /* Bail out if target does not provide rawmemchr for a certain mode.  */
      machine_mode mode = TYPE_MODE (load_type);
      if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing)
	return false;
      location_t loc = gimple_location (DR_STMT (load_dr));
      generate_rawmemchr_builtin (loop, reduction_var, store_dr, load_iv.base,
				  pattern, loc);
      return true;
    }

  /* Handle strlen like loops.  */
  if (store_dr == NULL
      && integer_zerop (pattern)
      && INTEGRAL_TYPE_P (TREE_TYPE (reduction_var))
      && TREE_CODE (reduction_iv.base) == INTEGER_CST
      && TREE_CODE (reduction_iv.step) == INTEGER_CST
      && integer_onep (reduction_iv.step))
    {
      location_t loc = gimple_location (DR_STMT (load_dr));
      tree reduction_var_type = TREE_TYPE (reduction_var);
      /* While determining the length of a string an overflow might occur.
	 If an overflow only occurs in the loop implementation and not in the
	 strlen implementation, then either the overflow is undefined or the
	 truncated result of strlen equals the one of the loop.  Otherwise if
	 an overflow may also occur in the strlen implementation, then
	 replacing a loop by a call to strlen is sound whenever we ensure that
	 if an overflow occurs in the strlen implementation, then also an
	 overflow occurs in the loop implementation which is undefined.  It
	 seems reasonable to relax this and assume that the strlen
	 implementation cannot overflow in case sizetype is big enough in the
	 sense that an overflow can only happen for string objects which are
	 bigger than half of the address space; at least for 32-bit targets and
	 up.

	 For strlen which makes use of rawmemchr the maximal length of a string
	 which can be determined without an overflow is PTRDIFF_MAX / S where
	 each character has size S.  Since an overflow for ptrdiff type is
	 undefined we have to make sure that if an overflow occurs, then an
	 overflow occurs in the loop implementation, too, and this is
	 undefined, too.  Similar as before we relax this and assume that no
	 string object is larger than half of the address space; at least for
	 32-bit targets and up.  */
      if (TYPE_MODE (load_type) == TYPE_MODE (char_type_node)
	  && TYPE_PRECISION (load_type) == TYPE_PRECISION (char_type_node)
	  && ((TYPE_PRECISION (sizetype) >= TYPE_PRECISION (ptr_type_node) - 1
	       && TYPE_PRECISION (ptr_type_node) >= 32)
	      || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
		  && TYPE_PRECISION (reduction_var_type) <= TYPE_PRECISION (sizetype)))
	  && builtin_decl_implicit (BUILT_IN_STRLEN))
	generate_strlen_builtin (loop, reduction_var, load_iv.base,
				 reduction_iv.base, loc);
      else if (direct_optab_handler (rawmemchr_optab, TYPE_MODE (load_type))
	       != CODE_FOR_nothing
	       && ((TYPE_PRECISION (ptrdiff_type_node) == TYPE_PRECISION (ptr_type_node)
		    && TYPE_PRECISION (ptrdiff_type_node) >= 32)
		   || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
		       && reduction_var_overflows_first (reduction_var_type, load_type))))
	generate_strlen_builtin_using_rawmemchr (loop, reduction_var,
						 load_iv.base,
						 load_type,
						 reduction_iv.base, loc);
      else
	return false;
      return true;
    }

  return false;
}
3820 :
3821 : /* Given innermost LOOP, return the outermost enclosing loop that forms a
3822 : perfect loop nest. */
3823 :
static class loop *
prepare_perfect_loop_nest (class loop *loop)
{
  class loop *outer = loop_outer (loop);
  tree niters = number_of_latch_executions (loop);

  /* TODO: We only support the innermost 3-level loop nest distribution
     because of compilation time issue for now.  This should be relaxed
     in the future.  Note we only allow 3-level loop nest distribution
     when parallelizing loops.

     Each successful iteration hoists LOOP one level outwards; note that
     NITERS is re-evaluated for the new outer loop inside the condition,
     so the chrec check below always inspects the latch count of the
     loop we are about to leave.  */
  while ((loop->inner == NULL
	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
	 && loop_outer (outer)
	 && outer->inner == loop && loop->next == NULL
	 && single_exit (outer)
	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
	 && niters != chrec_dont_know)
    {
      loop = outer;
      outer = loop_outer (loop);
    }

  return loop;
}
3849 :
3850 :
unsigned int
loop_distribution::execute (function *fun)
{
  bool changed = false;
  basic_block bb;
  control_dependences *cd = NULL;
  auto_vec<loop_p> loops_to_be_destroyed;

  if (number_of_loops (fun) <= 1)
    return 0;

  bb_top_order_init ();

  /* Invalidate all statement uids up front; they are assigned lazily
     per distributed loop.  */
  FOR_ALL_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
    }

  /* We can at the moment only distribute non-nested loops, thus restrict
     walking to innermost loops.  */
  for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
    {
      /* Don't distribute multiple exit edges loop, or cold loop when
	 not doing pattern detection.  */
      if (!single_exit (loop)
	  || (!flag_tree_loop_distribute_patterns
	      && !optimize_loop_for_speed_p (loop)))
	continue;

      /* If niters is unknown don't distribute loop but rather try to transform
	 it to a call to a builtin.  */
      tree niters = number_of_latch_executions (loop);
      if (niters == NULL_TREE || niters == chrec_dont_know)
	{
	  datarefs_vec.create (20);
	  if (flag_tree_loop_distribute_patterns
	      && transform_reduction_loop (loop))
	    {
	      changed = true;
	      loops_to_be_destroyed.safe_push (loop);
	      if (dump_enabled_p ())
		{
		  dump_user_location_t loc = find_loop_location (loop);
		  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				   loc, "Loop %d transformed into a builtin.\n",
				   loop->num);
		}
	    }
	  free_data_refs (datarefs_vec);
	  continue;
	}

      /* Get the perfect loop nest for distribution.  */
      loop = prepare_perfect_loop_nest (loop);
      for (; loop; loop = loop->inner)
	{
	  auto_vec<gimple *> work_list;
	  if (!find_seed_stmts_for_distribution (loop, &work_list))
	    continue;

	  const char *str = loop->inner ? " nest" : "";
	  dump_user_location_t loc = find_loop_location (loop);
	  /* Control dependences are computed once, on first use, and
	     shared across all loops of the function.  */
	  if (!cd)
	    {
	      calculate_dominance_info (CDI_DOMINATORS);
	      calculate_dominance_info (CDI_POST_DOMINATORS);
	      cd = new control_dependences ();
	      free_dominance_info (CDI_POST_DOMINATORS);
	    }

	  bool destroy_p;
	  int nb_generated_loops, nb_generated_calls;
	  bool only_patterns = !optimize_loop_for_speed_p (loop)
			       || !flag_tree_loop_distribution;
	  /* do not try to distribute loops that are not expected to iterate.  */
	  if (!only_patterns)
	    {
	      HOST_WIDE_INT iterations = estimated_loop_iterations_int (loop);
	      if (iterations < 0)
		iterations = likely_max_loop_iterations_int (loop);
	      if (!iterations)
		only_patterns = true;
	    }
	  nb_generated_loops
	    = distribute_loop (loop, work_list, cd, &nb_generated_calls,
			       &destroy_p, only_patterns);
	  if (destroy_p)
	    loops_to_be_destroyed.safe_push (loop);

	  if (nb_generated_loops + nb_generated_calls > 0)
	    {
	      changed = true;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				 loc, "Loop%s %d distributed: split to %d loops "
				 "and %d library calls.\n", str, loop->num,
				 nb_generated_loops, nb_generated_calls);

	      /* Once an outer loop of the nest was distributed, do not
		 descend into its (now stale) inner loops.  */
	      break;
	    }

	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
	}
    }

  if (cd)
    delete cd;

  if (bb_top_order_index != NULL)
    bb_top_order_destroy ();

  if (changed)
    {
      /* Destroy loop bodies that could not be reused.  Do this late as we
	 otherwise can end up referring to stale data in control
	 dependences.  */
      unsigned i;
      class loop *loop;
      FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
	destroy_loop (loop);

      /* Cached scalar evolutions now may refer to wrong or non-existing
	 loops.  */
      scev_reset ();
      mark_virtual_operands_for_renaming (fun);
      rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
    }

  checking_verify_loop_structure ();

  return changed ? TODO_cleanup_cfg : 0;
}
3988 :
3989 :
3990 : /* Distribute all loops in the current function. */
3991 :
3992 : namespace {
3993 :
3994 : const pass_data pass_data_loop_distribution =
/* Descriptor used to register the "ldist" pass with the pass manager.  */
const pass_data pass_data_loop_distribution =
{
  GIMPLE_PASS, /* type */
  "ldist", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
  ( PROP_cfg | PROP_ssa ), /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
4006 :
class pass_loop_distribution : public gimple_opt_pass
{
public:
  pass_loop_distribution (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_distribution, ctxt)
  {}

  /* opt_pass methods: */
  /* Run the pass if either full loop distribution or at least pattern
     detection is enabled.  */
  bool gate (function *) final override
  {
    return flag_tree_loop_distribution
	   || flag_tree_loop_distribute_patterns;
  }

  unsigned int execute (function *) final override;

}; // class pass_loop_distribution
4024 :
4025 : unsigned int
4026 201437 : pass_loop_distribution::execute (function *fun)
4027 : {
4028 201437 : return loop_distribution ().execute (fun);
4029 : }
4030 :
4031 : } // anon namespace
4032 :
4033 : gimple_opt_pass *
4034 285722 : make_pass_loop_distribution (gcc::context *ctxt)
4035 : {
4036 285722 : return new pass_loop_distribution (ctxt);
4037 : }
|