Branch data Line data Source code
1 : : /* Loop versioning pass.
2 : : Copyright (C) 2018-2024 Free Software Foundation, Inc.
3 : :
4 : : This file is part of GCC.
5 : :
6 : : GCC is free software; you can redistribute it and/or modify it
7 : : under the terms of the GNU General Public License as published by the
8 : : Free Software Foundation; either version 3, or (at your option) any
9 : : later version.
10 : :
11 : : GCC is distributed in the hope that it will be useful, but WITHOUT
12 : : ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 : : for more details.
15 : :
16 : : You should have received a copy of the GNU General Public License
17 : : along with GCC; see the file COPYING3. If not see
18 : : <http://www.gnu.org/licenses/>. */
19 : :
20 : : #include "config.h"
21 : : #define INCLUDE_MEMORY
22 : : #include "system.h"
23 : : #include "coretypes.h"
24 : : #include "backend.h"
25 : : #include "tree.h"
26 : : #include "gimple.h"
27 : : #include "gimple-iterator.h"
28 : : #include "tree-pass.h"
29 : : #include "gimplify-me.h"
30 : : #include "cfgloop.h"
31 : : #include "tree-ssa-loop.h"
32 : : #include "ssa.h"
33 : : #include "tree-scalar-evolution.h"
34 : : #include "tree-ssa-loop-ivopts.h"
35 : : #include "fold-const.h"
36 : : #include "tree-ssa-propagate.h"
37 : : #include "tree-inline.h"
38 : : #include "domwalk.h"
39 : : #include "tree-vectorizer.h"
40 : : #include "omp-general.h"
41 : : #include "predict.h"
42 : : #include "tree-into-ssa.h"
43 : : #include "gimple-range.h"
44 : : #include "tree-cfg.h"
45 : :
46 : : namespace {
47 : :
48 : : /* This pass looks for loops that could be simplified if certain loop
49 : : invariant conditions were true. It is effectively a form of loop
50 : : splitting in which the pass produces the split conditions itself,
51 : : instead of using ones that are already present in the IL.
52 : :
53 : : Versioning for when strides are 1
54 : : ---------------------------------
55 : :
56 : : At the moment the only thing the pass looks for is memory references
57 : : like:
58 : :
59 : : for (auto i : ...)
60 : : ...x[i * stride]...
61 : :
62 : : It considers changing such loops to:
63 : :
64 : : if (stride == 1)
65 : : for (auto i : ...) [A]
66 : : ...x[i]...
67 : : else
68 : : for (auto i : ...) [B]
69 : : ...x[i * stride]...
70 : :
71 : : This can have several benefits:
72 : :
73 : : (1) [A] is often easier or cheaper to vectorize than [B].
74 : :
75 : : (2) The scalar code in [A] is simpler than the scalar code in [B]
76 : : (if the loops cannot be vectorized or need an epilogue loop).
77 : :
78 : : (3) We might recognize [A] as a pattern, such as a memcpy or memset.
79 : :
80 : : (4) [A] has simpler address evolutions, which can help other passes
81 : : like loop interchange.
82 : :
83 : : The optimization is particularly useful for assumed-shape arrays in
84 : : Fortran, where the stride of the innermost dimension depends on the
85 : : array descriptor but is often equal to 1 in practice. For example:
86 : :
87 : : subroutine f1(x)
88 : : real :: x(:)
89 : : x(:) = 100
90 : : end subroutine f1
91 : :
92 : : generates the equivalent of:
93 : :
94 : : raw_stride = *x.dim[0].stride;
95 : : stride = raw_stride != 0 ? raw_stride : 1;
96 : : x_base = *x.data;
97 : : ...
98 : : tmp1 = stride * S;
99 : : tmp2 = tmp1 - stride;
100 : : *x_base[tmp2] = 1.0e+2;
101 : :
102 : : but in the common case that stride == 1, the last three statements
103 : : simplify to:
104 : :
105 : : tmp3 = S + -1;
106 : : *x_base[tmp3] = 1.0e+2;
107 : :
108 : : The optimization is in principle very simple. The difficult parts are:
109 : :
110 : : (a) deciding which parts of a general address calculation correspond
111 : : to the inner dimension of an array, since this usually isn't explicit
112 : : in the IL, and for C often isn't even explicit in the source code
113 : :
114 : : (b) estimating when the transformation is worthwhile
115 : :
116 : : Structure
117 : : ---------
118 : :
119 : : The pass has four phases:
120 : :
121 : : (1) Walk through the statements looking for and recording potential
122 : : versioning opportunities. Stop if there are none.
123 : :
124 : : (2) Use context-sensitive range information to see whether any versioning
125 : : conditions are impossible in practice. Remove them if so, and stop
126 : : if no opportunities remain.
127 : :
128 : : (We do this only after (1) to keep compile time down when no
129 : : versioning opportunities exist.)
130 : :
131 : : (3) Apply the cost model. Decide which versioning opportunities are
132 : : worthwhile and at which nesting level they should be applied.
133 : :
134 : : (4) Attempt to version all the loops selected by (3), so that:
135 : :
136 : : for (...)
137 : : ...
138 : :
139 : : becomes:
140 : :
141 : : if (!cond)
142 : : for (...) // Original loop
143 : : ...
144 : : else
145 : : for (...) // New loop
146 : : ...
147 : :
148 : : Use the version condition COND to simplify the new loop. */
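
/* As a hypothetical sketch (combining the Fortran example and the generic
   transformation above, rather than showing the exact GIMPLE the pass
   emits), the store loop in subroutine f1 would conceptually become:

     if (stride == 1)
       for (i = 0; i < S; i++)
         x_base[i] = 1.0e+2;           // contiguous: easier to vectorize
                                       // or recognize as a pattern
     else
       for (i = 0; i < S; i++)
         x_base[i * stride] = 1.0e+2;  // original strided form  */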
149 : :
150 : : /* Enumerates the likelihood that a particular value indexes the inner
151 : : dimension of an array. */
152 : : enum inner_likelihood {
153 : : INNER_UNLIKELY,
154 : : INNER_DONT_KNOW,
155 : : INNER_LIKELY
156 : : };
157 : :
158 : : /* Information about one term of an address_info. */
159 : : struct address_term_info
160 : : {
161 : : /* The value of the term is EXPR * MULTIPLIER. */
162 : : tree expr;
163 : : unsigned HOST_WIDE_INT multiplier;
164 : :
165 : : /* The stride applied by EXPR in each iteration of some unrecorded loop,
166 : : or null if no stride has been identified. */
167 : : tree stride;
168 : :
169 : : /* Enumerates the likelihood that EXPR indexes the inner dimension
170 : : of an array. */
171 : : enum inner_likelihood inner_likelihood;
172 : :
173 : : /* True if STRIDE == 1 is a versioning opportunity when considered
174 : : in isolation. */
175 : : bool versioning_opportunity_p;
176 : : };
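
/* As a hypothetical example of how these fields are used (not taken from
   any particular testcase): for an access a[i * stride] to 4-byte elements,
   EXPR would be the SSA name computing "i * stride" and MULTIPLIER would
   be 4.  Later analysis might then set STRIDE to the SSA name "stride",
   set INNER_LIKELIHOOD according to how plausible it is that "stride"
   steps through the innermost dimension, and set VERSIONING_OPPORTUNITY_P
   if checking "stride == 1" in isolation looks worthwhile.  */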
177 : :
178 : : /* Information about an address calculation, and the range of constant
179 : : offsets applied to it. */
180 : 124370 : class address_info
181 : : {
182 : : public:
183 : : static const unsigned int MAX_TERMS = 8;
184 : :
185 : : /* One statement that calculates the address. If multiple statements
186 : : share the same address, we only record the first. */
187 : : gimple *stmt;
188 : :
189 : : /* The loop containing STMT (cached for convenience). If multiple
190 : : statements share the same address, they all belong to this loop. */
191 : : class loop *loop;
192 : :
193 : : /* A decomposition of the calculation into a sum of terms plus an
194 : : optional base. When BASE is provided, it is never an SSA name.
195 : : Once initialization is complete, all members of TERMS are SSA names. */
196 : : tree base;
197 : : auto_vec<address_term_info, MAX_TERMS> terms;
198 : :
199 : : /* All bytes accessed from the address fall in the offset range
200 : : [MIN_OFFSET, MAX_OFFSET). */
201 : : HOST_WIDE_INT min_offset, max_offset;
202 : : };
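
/* For illustration (hypothetical accesses, not from any particular
   testcase): if a loop contains both a[i * stride] and a[i * stride + 1]
   with 4-byte elements, the two references share the same BASE and TERMS,
   so they are pooled into a single address_info whose offset range is
   [0, 8): the first access covers bytes [0, 4) and the second covers
   bytes [4, 8) of the common calculation.  */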
203 : :
204 : : /* Stores addresses based on their base and terms (ignoring the offsets). */
205 : : struct address_info_hasher : nofree_ptr_hash <address_info>
206 : : {
207 : : static hashval_t hash (const address_info *);
208 : : static bool equal (const address_info *, const address_info *);
209 : : };
210 : :
211 : : /* Information about the versioning we'd like to apply to a loop. */
212 : 128151 : class loop_info
213 : : {
214 : : public:
215 : : bool worth_versioning_p () const;
216 : :
217 : : /* True if we've decided not to version this loop. The remaining
218 : : fields are meaningless if so. */
219 : : bool rejected_p;
220 : :
221 : : /* True if at least one subloop of this loop benefits from versioning. */
222 : : bool subloops_benefit_p;
223 : :
224 : : /* An estimate of the total number of instructions in the loop,
225 : : excluding those in subloops that benefit from versioning. */
226 : : unsigned int num_insns;
227 : :
228 : : /* The outermost loop that can handle all the version checks
229 : : described below. */
230 : : class loop *outermost;
231 : :
232 : : /* The first entry in the list of blocks that belong to this loop
233 : : (and not to subloops). m_next_block_in_loop provides the chain
234 : : pointers for the list. */
235 : : basic_block block_list;
236 : :
237 : : /* We'd like to version the loop for the case in which these SSA names
238 : : (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime. */
239 : : bitmap_head unity_names;
240 : :
241 : : /* If versioning succeeds, this points to the version of the loop that
242 : : assumes the version conditions hold. */
243 : : class loop *optimized_loop;
244 : : };
245 : :
246 : : /* The main pass structure. */
247 : : class loop_versioning
248 : : {
249 : : public:
250 : : loop_versioning (function *);
251 : : ~loop_versioning ();
252 : : unsigned int run ();
253 : :
254 : : private:
255 : : /* Used to walk the dominator tree to find loop versioning conditions
256 : : that are always false. */
257 : 1284 : class lv_dom_walker : public dom_walker
258 : : {
259 : : public:
260 : : lv_dom_walker (loop_versioning &);
261 : :
262 : : edge before_dom_children (basic_block) final override;
263 : :
264 : : private:
265 : : /* The parent pass. */
266 : : loop_versioning &m_lv;
267 : : };
268 : :
269 : : /* Used to simplify statements based on conditions that are established
270 : : by the version checks. */
271 : 2104 : class name_prop : public substitute_and_fold_engine
272 : : {
273 : : public:
274 : 2104 : name_prop (loop_info &li) : m_li (li) {}
275 : : tree value_of_expr (tree name, gimple *) final override;
276 : :
277 : : private:
278 : : /* Information about the versioning we've performed on the loop. */
279 : : loop_info &m_li;
280 : : };
281 : :
282 : 1884328 : loop_info &get_loop_info (class loop *loop) { return m_loops[loop->num]; }
283 : :
284 : : unsigned int max_insns_for_loop (class loop *);
285 : : bool expensive_stmt_p (gimple *);
286 : :
287 : : void version_for_unity (gimple *, tree);
288 : : bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT,
289 : : unsigned HOST_WIDE_INT * = 0);
290 : : bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *);
291 : : bool multiply_term_by (address_term_info &, tree);
292 : : inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT);
293 : : void dump_inner_likelihood (address_info &, address_term_info &);
294 : : void analyze_stride (address_info &, address_term_info &,
295 : : tree, class loop *);
296 : : bool find_per_loop_multiplication (address_info &, address_term_info &);
297 : : bool analyze_term_using_scevs (address_info &, address_term_info &);
298 : : void analyze_arbitrary_term (address_info &, address_term_info &);
299 : : void analyze_address_fragment (address_info &);
300 : : void record_address_fragment (gimple *, unsigned HOST_WIDE_INT,
301 : : tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT);
302 : : void analyze_expr (gimple *, tree);
303 : : bool analyze_block (basic_block);
304 : : bool analyze_blocks ();
305 : :
306 : : void prune_loop_conditions (class loop *);
307 : : bool prune_conditions ();
308 : :
309 : : void merge_loop_info (class loop *, class loop *);
310 : : void add_loop_to_queue (class loop *);
311 : : bool decide_whether_loop_is_versionable (class loop *);
312 : : bool make_versioning_decisions ();
313 : :
314 : : bool version_loop (class loop *);
315 : : void implement_versioning_decisions ();
316 : :
317 : : /* The function we're optimizing. */
318 : : function *m_fn;
319 : :
320 : : /* The obstack to use for all pass-specific bitmaps. */
321 : : bitmap_obstack m_bitmap_obstack;
322 : :
323 : : /* An obstack to use for general allocation. */
324 : : obstack m_obstack;
325 : :
326 : : /* The total number of loop version conditions we've found. */
327 : : unsigned int m_num_conditions;
328 : :
329 : : /* Assume that an address fragment of the form i * stride * scale
330 : : (for variable stride and constant scale) will not benefit from
331 : : versioning for stride == 1 when scale is greater than this value. */
332 : : unsigned HOST_WIDE_INT m_maximum_scale;
333 : :
334 : : /* Information about each loop. */
335 : : auto_vec<loop_info> m_loops;
336 : :
337 : : /* Used to form a linked list of blocks that belong to a loop,
338 : : started by loop_info::block_list. */
339 : : auto_vec<basic_block> m_next_block_in_loop;
340 : :
341 : : /* The list of loops that we've decided to version. */
342 : : auto_vec<class loop *> m_loops_to_version;
343 : :
344 : : /* A table of addresses in the current loop, keyed off their values
345 : : but not their offsets. */
346 : : hash_table <address_info_hasher> m_address_table;
347 : :
348 : : /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order. */
349 : : auto_vec <address_info *, 32> m_address_list;
350 : : };
351 : :
352 : : /* If EXPR is an SSA name and not a default definition, return the
353 : : defining statement, otherwise return null. */
354 : :
355 : : static gimple *
356 : 1048890 : maybe_get_stmt (tree expr)
357 : : {
358 : 904866 : if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr))
359 : 0 : return SSA_NAME_DEF_STMT (expr);
360 : : return NULL;
361 : : }
362 : :
363 : : /* Like maybe_get_stmt, but also return null if the defining
364 : : statement isn't an assignment. */
365 : :
366 : : static gassign *
367 : 932489 : maybe_get_assign (tree expr)
368 : : {
369 : 1976605 : return safe_dyn_cast <gassign *> (maybe_get_stmt (expr));
370 : : }
371 : :
372 : : /* Return true if this pass should look through a cast of expression FROM
373 : : to type TYPE when analyzing pieces of an address. */
374 : :
375 : : static bool
376 : 55047 : look_through_cast_p (tree type, tree from)
377 : : {
378 : 55047 : return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type)
379 : 55047 : && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type));
380 : : }
381 : :
382 : : /* Strip all conversions of integers or pointers from EXPR, regardless
383 : : of whether the conversions are nops. This is useful in the context
384 : : of this pass because we're not trying to fold or simulate the
385 : : expression; we just want to see how it's structured. */
386 : :
387 : : static tree
388 : 367259 : strip_casts (tree expr)
389 : : {
390 : 367259 : const unsigned int MAX_NITERS = 4;
391 : :
392 : 367259 : tree type = TREE_TYPE (expr);
393 : 367259 : while (CONVERT_EXPR_P (expr)
394 : 367449 : && look_through_cast_p (type, TREE_OPERAND (expr, 0)))
395 : 190 : expr = TREE_OPERAND (expr, 0);
396 : :
397 : 421302 : for (unsigned int niters = 0; niters < MAX_NITERS; ++niters)
398 : : {
399 : 421302 : gassign *assign = maybe_get_assign (expr);
400 : 421302 : if (assign
401 : 209574 : && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))
402 : 476159 : && look_through_cast_p (type, gimple_assign_rhs1 (assign)))
403 : 54043 : expr = gimple_assign_rhs1 (assign);
404 : : else
405 : : break;
406 : : }
407 : 367259 : return expr;
408 : : }
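
/* As a hypothetical example: given "_2 = (long unsigned int) i_1" where
   i_1 has type int, strip_casts (_2) returns i_1, since both types are
   integral and neither is a pointer.  A conversion between a pointer and
   an integer would not be looked through.  */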
409 : :
410 : : /* Compare two address_term_infos in the same address_info. */
411 : :
412 : : static int
413 : 275850 : compare_address_terms (const void *a_uncast, const void *b_uncast)
414 : : {
415 : 275850 : const address_term_info *a = (const address_term_info *) a_uncast;
416 : 275850 : const address_term_info *b = (const address_term_info *) b_uncast;
417 : :
418 : 275850 : if (a->expr != b->expr)
419 : 275506 : return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1;
420 : :
421 : 344 : if (a->multiplier != b->multiplier)
422 : 266 : return a->multiplier < b->multiplier ? -1 : 1;
423 : :
424 : : return 0;
425 : : }
426 : :
427 : : /* Dump ADDRESS using flags FLAGS. */
428 : :
429 : : static void
430 : 420 : dump_address_info (dump_flags_t flags, address_info &address)
431 : : {
432 : 420 : if (address.base)
433 : 2 : dump_printf (flags, "%T + ", address.base);
434 : 2340 : for (unsigned int i = 0; i < address.terms.length (); ++i)
435 : : {
436 : 750 : if (i != 0)
437 : 330 : dump_printf (flags, " + ");
438 : 750 : dump_printf (flags, "%T", address.terms[i].expr);
439 : 750 : if (address.terms[i].multiplier != 1)
440 : 522 : dump_printf (flags, " * %wd", address.terms[i].multiplier);
441 : : }
442 : 840 : dump_printf (flags, " + [%wd, %wd]",
443 : 420 : address.min_offset, address.max_offset - 1);
444 : 420 : }
445 : :
446 : : /* Hash an address_info based on its base and terms. */
447 : :
448 : : hashval_t
449 : 173300 : address_info_hasher::hash (const address_info *info)
450 : : {
451 : 173300 : inchash::hash hash;
452 : 173300 : hash.add_int (info->base ? TREE_CODE (info->base) : 0);
453 : 346600 : hash.add_int (info->terms.length ());
454 : 433759 : for (unsigned int i = 0; i < info->terms.length (); ++i)
455 : : {
456 : 260459 : hash.add_int (SSA_NAME_VERSION (info->terms[i].expr));
457 : 260459 : hash.add_hwi (info->terms[i].multiplier);
458 : : }
459 : 173300 : return hash.end ();
460 : : }
461 : :
462 : : /* Return true if two address_infos have equal bases and terms. Other
463 : : properties might be different (such as the statement or constant
464 : : offset range). */
465 : :
466 : : bool
467 : 93977 : address_info_hasher::equal (const address_info *a, const address_info *b)
468 : : {
469 : 93977 : if (a->base != b->base
470 : 93977 : && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0)))
471 : 2633 : return false;
472 : :
473 : 274032 : if (a->terms.length () != b->terms.length ())
474 : : return false;
475 : :
476 : 180638 : for (unsigned int i = 0; i < a->terms.length (); ++i)
477 : 116631 : if (a->terms[i].expr != b->terms[i].expr
478 : 116631 : || a->terms[i].multiplier != b->terms[i].multiplier)
479 : : return false;
480 : :
481 : : return true;
482 : : }
483 : :
484 : : /* Return true if we want to version the loop, i.e. if we have a
485 : : specific reason for doing so and no specific reason not to. */
486 : :
487 : : bool
488 : 6658 : loop_info::worth_versioning_p () const
489 : : {
490 : 6658 : return (!rejected_p
491 : 5074 : && (!bitmap_empty_p (&unity_names) || subloops_benefit_p));
492 : : }
493 : :
494 : 642 : loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
495 : 0 : : dom_walker (CDI_DOMINATORS), m_lv (lv)
496 : : {
497 : 0 : }
498 : :
499 : : /* Process BB before processing the blocks it dominates. */
500 : :
501 : : edge
502 : 28367 : loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
503 : : {
504 : 28367 : if (bb == bb->loop_father->header)
505 : 3929 : m_lv.prune_loop_conditions (bb->loop_father);
506 : :
507 : 28367 : return NULL;
508 : : }
509 : :
510 : : /* Decide whether to replace VAL with a new value in a versioned loop.
511 : : Return the new value if so, otherwise return null. */
512 : :
513 : : tree
514 : 68105 : loop_versioning::name_prop::value_of_expr (tree val, gimple *)
515 : : {
516 : 68105 : if (TREE_CODE (val) == SSA_NAME
517 : 68105 : && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val)))
518 : 1273 : return build_one_cst (TREE_TYPE (val));
519 : : return NULL_TREE;
520 : : }
521 : :
522 : : /* Initialize the structure to optimize FN. */
523 : :
524 : 24579 : loop_versioning::loop_versioning (function *fn)
525 : 24579 : : m_fn (fn),
526 : 24579 : m_num_conditions (0),
527 : 24579 : m_address_table (31)
528 : : {
529 : 24579 : unsigned m_nloops = number_of_loops (fn);
530 : 24579 : bitmap_obstack_initialize (&m_bitmap_obstack);
531 : 24579 : gcc_obstack_init (&m_obstack);
532 : :
533 : : /* Initialize the loop information. */
534 : 24579 : m_loops.safe_grow_cleared (m_nloops, true);
535 : 152730 : for (unsigned int i = 0; i < m_nloops; ++i)
536 : : {
537 : 128151 : m_loops[i].outermost = get_loop (m_fn, 0);
538 : 128151 : bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack);
539 : : }
540 : :
541 : : /* Initialize the list of blocks that belong to each loop. */
542 : 24579 : unsigned int nbbs = last_basic_block_for_fn (fn);
543 : 24579 : m_next_block_in_loop.safe_grow (nbbs, true);
544 : 24579 : basic_block bb;
545 : 619877 : FOR_EACH_BB_FN (bb, fn)
546 : : {
547 : 595298 : loop_info &li = get_loop_info (bb->loop_father);
548 : 595298 : m_next_block_in_loop[bb->index] = li.block_list;
549 : 595298 : li.block_list = bb;
550 : : }
551 : :
552 : : /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for
553 : : unvectorizable code, since it is the largest size that can be
554 : : handled efficiently by scalar code. omp_max_vf calculates the
555 : : maximum number of bytes in a vector, when such a value is relevant
556 : : to loop optimization. */
557 : 24579 : m_maximum_scale = estimated_poly_value (omp_max_vf ());
558 : 73737 : m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
559 : 24579 : }
560 : :
561 : 24579 : loop_versioning::~loop_versioning ()
562 : : {
563 : 24579 : bitmap_obstack_release (&m_bitmap_obstack);
564 : 24579 : obstack_free (&m_obstack, NULL);
565 : 24579 : }
566 : :
567 : : /* Return the maximum number of instructions allowed in LOOP before
568 : : it becomes too big for versioning.
569 : :
570 : : There are separate limits for inner and outer loops. The limit for
571 : : inner loops applies only to loops that benefit directly from versioning.
572 : : The limit for outer loops applies to all code in the outer loop and
573 : : its subloops that *doesn't* benefit directly from versioning; such code
574 : : would be "taken along for the ride". The idea is that if the cost of
575 : : the latter is small, it is better to version outer loops rather than
576 : : inner loops, both to reduce the number of repeated checks and to enable
577 : : more of the loop nest to be optimized as a natural nest (e.g. by loop
578 : : interchange or outer-loop vectorization). */
579 : :
580 : : unsigned int
581 : 1371 : loop_versioning::max_insns_for_loop (class loop *loop)
582 : : {
583 : 1371 : return (loop->inner
584 : 317 : ? param_loop_versioning_max_outer_insns
585 : 1054 : : param_loop_versioning_max_inner_insns);
586 : : }
587 : :
588 : : /* Return true if for cost reasons we should avoid versioning any loop
589 : : that contains STMT.
590 : :
591 : : Note that we don't need to check whether versioning is invalid for
592 : : correctness reasons, since the versioning process does that for us.
593 : : The conditions involved are too rare to be worth duplicating here. */
594 : :
595 : : bool
596 : 671231 : loop_versioning::expensive_stmt_p (gimple *stmt)
597 : : {
598 : 0 : if (gcall *call = dyn_cast <gcall *> (stmt))
599 : : /* Assume for now that the time spent in an "expensive" call would
600 : : overwhelm any saving from versioning. */
601 : 20491 : return !gimple_inexpensive_call_p (call);
602 : : return false;
603 : : }
604 : :
605 : : /* Record that we want to version the loop that contains STMT for the
606 : : case in which SSA name NAME is equal to 1. We already know that NAME
607 : : is invariant in the loop. */
608 : :
609 : : void
610 : 2325 : loop_versioning::version_for_unity (gimple *stmt, tree name)
611 : : {
612 : 2325 : class loop *loop = loop_containing_stmt (stmt);
613 : 2325 : loop_info &li = get_loop_info (loop);
614 : :
615 : 2325 : if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name)))
616 : : {
617 : : /* This is the first time we've wanted to version LOOP for NAME.
618 : : Keep track of the outermost loop that can handle all versioning
619 : : checks in LI. */
620 : 1327 : class loop *outermost
621 : 1327 : = outermost_invariant_loop_for_expr (loop, name);
622 : 1560 : if (loop_depth (li.outermost) < loop_depth (outermost))
623 : 1035 : li.outermost = outermost;
624 : :
625 : 1327 : if (dump_enabled_p ())
626 : : {
627 : 70 : dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop"
628 : : " for when %T == 1", name);
629 : 70 : if (outermost == loop)
630 : 23 : dump_printf (MSG_NOTE, "; cannot hoist check further");
631 : : else
632 : : {
633 : 57 : dump_printf (MSG_NOTE, "; could implement the check at loop"
634 : : " depth %d", loop_depth (outermost));
635 : 67 : if (loop_depth (li.outermost) > loop_depth (outermost))
636 : 0 : dump_printf (MSG_NOTE, ", but other checks only allow"
637 : 0 : " a depth of %d", loop_depth (li.outermost));
638 : : }
639 : 70 : dump_printf (MSG_NOTE, "\n");
640 : : }
641 : :
642 : 1327 : m_num_conditions += 1;
643 : : }
644 : : else
645 : : {
646 : : /* This is a duplicate request. */
647 : 998 : if (dump_enabled_p ())
648 : 6 : dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing"
649 : : " loop for when %T == 1\n", name);
650 : : }
651 : 2325 : }
652 : :
653 : : /* Return true if OP1_TREE is constant and if in principle it is worth
654 : : versioning an address fragment of the form:
655 : :
656 : : i * OP1_TREE * OP2 * stride
657 : :
658 : : for the case in which stride == 1. This in practice means testing
659 : : whether:
660 : :
661 : : OP1_TREE * OP2 <= M_MAXIMUM_SCALE.
662 : :
663 : : If RESULT is nonnull, store OP1_TREE * OP2 there when returning true. */
664 : :
665 : : bool
666 : 281806 : loop_versioning::acceptable_multiplier_p (tree op1_tree,
667 : : unsigned HOST_WIDE_INT op2,
668 : : unsigned HOST_WIDE_INT *result)
669 : : {
670 : 281806 : if (tree_fits_uhwi_p (op1_tree))
671 : : {
672 : 265219 : unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree);
673 : : /* The first part checks for overflow. */
674 : 265219 : if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale)
675 : : {
676 : 242784 : if (result)
677 : 199706 : *result = op1 * op2;
678 : 242784 : return true;
679 : : }
680 : : }
681 : : return false;
682 : : }
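
/* A worked example with made-up numbers: if m_maximum_scale is 16, then
   OP1_TREE == 4 and OP2 == 2 give 4 * 2 == 8 <= 16, so the function
   returns true and stores 8 in *RESULT, whereas OP1_TREE == 32 and
   OP2 == 2 give 64 > 16 and the function returns false.  The
   "op1 * op2 >= op2" test simply rejects cases in which the unsigned
   multiplication wrapped around.  */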
683 : :
684 : : /* Return true if it is worth using loop versioning on a memory access
685 : : of type TYPE. Store the size of the access in *SIZE if so. */
686 : :
687 : : bool
688 : 178470 : loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size)
689 : : {
690 : 178470 : return (TYPE_SIZE_UNIT (type)
691 : 178470 : && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size));
692 : : }
693 : :
694 : : /* See whether OP is constant and whether we can multiply TERM by that
695 : : constant without exceeding M_MAXIMUM_SCALE. Return true and update
696 : : TERM if so. */
697 : :
698 : : bool
699 : 72136 : loop_versioning::multiply_term_by (address_term_info &term, tree op)
700 : : {
701 : 0 : return acceptable_multiplier_p (op, term.multiplier, &term.multiplier);
702 : : }
703 : :
704 : : /* Decide whether an address fragment of the form STRIDE * MULTIPLIER
705 : : is likely to be indexing an innermost dimension, returning the result
706 : : as an INNER_* probability. */
707 : :
708 : : inner_likelihood
709 : 66701 : loop_versioning::get_inner_likelihood (tree stride,
710 : : unsigned HOST_WIDE_INT multiplier)
711 : : {
712 : 66701 : const unsigned int MAX_NITERS = 8;
713 : :
714 : : /* Iterate over possible values of STRIDE. Return INNER_LIKELY if at
715 : : least one of those values is likely to be for the innermost dimension.
716 : : Record in UNLIKELY_P if at least one of those values is unlikely to be
717 : : for the innermost dimension.
718 : :
719 : : E.g. for:
720 : :
721 : : stride = cond ? a * b : 1
722 : :
723 : : we should treat STRIDE as being a likely inner dimension, since
724 : : we know that it is 1 under at least some circumstances. (See the
725 : : Fortran example below.) However:
726 : :
727 : : stride = a * b
728 : :
729 : : on its own is unlikely to be for the innermost dimension, since
730 : : that would require both a and b to be 1 at runtime. */
731 : 66701 : bool unlikely_p = false;
732 : 66701 : tree worklist[MAX_NITERS];
733 : 66701 : unsigned int length = 0;
734 : 66701 : worklist[length++] = stride;
735 : 104582 : for (unsigned int i = 0; i < length; ++i)
736 : : {
737 : 80959 : tree expr = worklist[i];
738 : :
739 : 80959 : if (CONSTANT_CLASS_P (expr))
740 : : {
741 : : /* See if EXPR * MULTIPLIER would be consistent with an individual
742 : : access or a small grouped access. */
743 : 45448 : if (acceptable_multiplier_p (expr, multiplier))
744 : : return INNER_LIKELY;
745 : : else
746 : : unlikely_p = true;
747 : : }
748 : 69892 : else if (gimple *stmt = maybe_get_stmt (expr))
749 : : {
750 : : /* If EXPR is set by a PHI node, queue its arguments in case
751 : : we find one that is consistent with an inner dimension.
752 : :
753 : : An important instance of this is the Fortran handling of array
754 : : descriptors, which calculates the stride of the inner dimension
755 : : using a PHI equivalent of:
756 : :
757 : : raw_stride = a.dim[0].stride;
758 : : stride = raw_stride != 0 ? raw_stride : 1;
759 : :
760 : : (Strides for outer dimensions do not treat 0 specially.) */
761 : 34381 : if (gphi *phi = dyn_cast <gphi *> (stmt))
762 : : {
763 : 9071 : unsigned int nargs = gimple_phi_num_args (phi);
764 : 23687 : for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j)
765 : 14616 : worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j));
766 : : }
767 : : /* If the value is set by an assignment, expect it to be read
768 : : from memory (such as an array descriptor) rather than be
769 : : calculated. */
770 : 63191 : else if (gassign *assign = dyn_cast <gassign *> (stmt))
771 : : {
772 : 24994 : if (!gimple_assign_load_p (assign))
773 : 11934 : unlikely_p = true;
774 : : }
775 : : /* Things like calls don't really tell us anything. */
776 : : }
777 : : }
778 : :
779 : : /* We didn't find any possible values of STRIDE that were likely to be
780 : : for the innermost dimension. If we found one that was actively
781 : : unlikely to be for the innermost dimension, assume that that applies
782 : : to STRIDE too. */
783 : 23623 : return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW;
784 : : }
785 : :
786 : : /* Dump the likelihood that TERM's stride is for the innermost dimension.
787 : : ADDRESS is the address that contains TERM. */
788 : :
789 : : void
790 : 237 : loop_versioning::dump_inner_likelihood (address_info &address,
791 : : address_term_info &term)
792 : : {
793 : 237 : if (term.inner_likelihood == INNER_LIKELY)
794 : 82 : dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the"
795 : : " innermost dimension\n", term.stride);
796 : 155 : else if (term.inner_likelihood == INNER_UNLIKELY)
797 : 37 : dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the"
798 : : " innermost dimension\n", term.stride);
799 : : else
800 : 118 : dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T"
801 : : " is the innermost dimension\n", term.stride);
802 : 237 : }
803 : :
804 : : /* The caller has identified that STRIDE is the stride of interest
805 : : in TERM, and that the stride is applied in OP_LOOP. Record this
806 : : information in TERM, deciding whether STRIDE is likely to be for
807 : : the innermost dimension of an array and whether it represents a
808 : : versioning opportunity. ADDRESS is the address that contains TERM. */
809 : :
810 : : void
811 : 49900 : loop_versioning::analyze_stride (address_info &address,
812 : : address_term_info &term,
813 : : tree stride, class loop *op_loop)
814 : : {
815 : 49900 : term.stride = stride;
816 : :
817 : 49900 : term.inner_likelihood = get_inner_likelihood (stride, term.multiplier);
818 : 49900 : if (dump_enabled_p ())
819 : 197 : dump_inner_likelihood (address, term);
820 : :
821 : : /* To be a versioning opportunity we require:
822 : :
823 : : - The multiplier applied by TERM is equal to the access size,
824 : : so that when STRIDE is 1, the accesses in successive loop
825 : : iterations are consecutive.
826 : :
827 : : This is deliberately conservative. We could relax it to handle
828 : : other cases (such as those with gaps between iterations) if we
829 : : find any real testcases for which it's useful.
830 : :
831 : : - the stride is applied in the same loop as STMT rather than
832 : : in an outer loop. Although versioning for strides applied in
833 : : outer loops could help in some cases -- such as enabling
834 : : more loop interchange -- the savings are much lower than for
835 : : inner loops.
836 : :
837 : : - the stride is an SSA name that is invariant in STMT's loop,
838 : : since otherwise versioning isn't possible. */
839 : 49900 : unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset;
840 : 49900 : if (term.multiplier == access_size
841 : 41775 : && address.loop == op_loop
842 : 39466 : && TREE_CODE (stride) == SSA_NAME
843 : 52512 : && expr_invariant_in_loop_p (address.loop, stride))
844 : : {
845 : 2601 : term.versioning_opportunity_p = true;
846 : 2601 : if (dump_enabled_p ())
847 : 88 : dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning"
848 : : " opportunity\n", stride);
849 : : }
850 : 49900 : }
851 : :
852 : : /* See whether address term TERM (which belongs to ADDRESS) is the result
853 : : of multiplying a varying SSA name by a loop-invariant SSA name.
854 : : Return true and update TERM if so.
855 : :
856 : : This handles both cases that SCEV might handle, such as:
857 : :
858 : : for (int i = 0; i < n; ++i)
859 : : res += a[i * stride];
860 : :
861 : : and ones in which the term varies arbitrarily between iterations, such as:
862 : :
863 : : for (int i = 0; i < n; ++i)
864 : : res += a[index[i] * stride]; */
865 : :
866 : : bool
867 : 86915 : loop_versioning::find_per_loop_multiplication (address_info &address,
868 : : address_term_info &term)
869 : : {
870 : 86915 : gassign *mult = maybe_get_assign (term.expr);
871 : 110862 : if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR)
872 : : return false;
873 : :
874 : 7216 : class loop *mult_loop = loop_containing_stmt (mult);
875 : 7216 : if (!loop_outer (mult_loop))
876 : : return false;
877 : :
878 : 6992 : tree op1 = strip_casts (gimple_assign_rhs1 (mult));
879 : 13984 : tree op2 = strip_casts (gimple_assign_rhs2 (mult));
880 : 6992 : if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
881 : : return false;
882 : :
883 : 6242 : bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1);
884 : 6242 : bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2);
885 : 6242 : if (invariant1_p == invariant2_p)
886 : : return false;
887 : :
888 : : /* Make sure that the loop invariant is OP2 rather than OP1. */
889 : 6025 : if (invariant1_p)
890 : 3732 : std::swap (op1, op2);
891 : :
892 : 6025 : if (dump_enabled_p ())
893 : 90 : dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T"
894 : : " * loop-invariant %T\n", term.expr, op1, op2);
895 : 6025 : analyze_stride (address, term, op2, mult_loop);
896 : 6025 : return true;
897 : : }
898 : :
899 : : /* Try to use scalar evolutions to find an address stride for TERM,
900 : : which belongs to ADDRESS. Return true and update TERM if so.
901 : :
902 : : Here we are interested in any evolution information we can find,
903 : : not just evolutions wrt ADDRESS->LOOP. For example, if we find that
904 : : an outer loop obviously iterates over the inner dimension of an array,
905 : : that information can help us eliminate worthless versioning opportunities
906 : : in inner loops. */
907 : :
908 : : bool
909 : 80890 : loop_versioning::analyze_term_using_scevs (address_info &address,
910 : : address_term_info &term)
911 : : {
912 : 117905 : gimple *setter = maybe_get_stmt (term.expr);
913 : 67569 : if (!setter)
914 : : return false;
915 : :
916 : 67569 : class loop *wrt_loop = loop_containing_stmt (setter);
917 : 67569 : if (!loop_outer (wrt_loop))
918 : : return false;
919 : :
920 : 53956 : tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr));
921 : 53956 : if (TREE_CODE (chrec) == POLYNOMIAL_CHREC)
922 : : {
923 : 43875 : if (dump_enabled_p ())
924 : 107 : dump_printf_loc (MSG_NOTE, address.stmt,
925 : : "address term %T = %T\n", term.expr, chrec);
926 : :
927 : : /* Peel casts and accumulate constant multiplications, up to the
928 : : limit allowed by M_MAXIMUM_SCALE. */
929 : 43875 : tree stride = strip_casts (CHREC_RIGHT (chrec));
930 : 43875 : while (TREE_CODE (stride) == MULT_EXPR
931 : 43875 : && multiply_term_by (term, TREE_OPERAND (stride, 1)))
932 : 0 : stride = strip_casts (TREE_OPERAND (stride, 0));
933 : :
934 : : gassign *assign;
935 : 43996 : while ((assign = maybe_get_assign (stride))
936 : 355 : && gimple_assign_rhs_code (assign) == MULT_EXPR
937 : 44117 : && multiply_term_by (term, gimple_assign_rhs2 (assign)))
938 : : {
939 : 121 : if (dump_enabled_p ())
940 : 24 : dump_printf_loc (MSG_NOTE, address.stmt,
941 : : "looking through %G", (gimple *) assign);
942 : 121 : stride = strip_casts (gimple_assign_rhs1 (assign));
943 : : }
944 : :
945 : 43875 : analyze_stride (address, term, stride, get_chrec_loop (chrec));
946 : 43875 : return true;
947 : : }
948 : :
949 : : return false;
950 : : }
951 : :
952 : : /* Address term TERM is an arbitrary term that provides no versioning
953 : : opportunities. Analyze it to see whether it contains any likely
954 : : inner strides, so that we don't mistakenly version for other
955 : : (less likely) candidates.
956 : :
957 : : This copes with invariant innermost indices such as:
958 : :
959 : : x(i, :) = 100
960 : :
961 : : where the "i" component of the address is invariant in the loop
962 : : but provides the real inner stride.
963 : :
964 : : ADDRESS is the address that contains TERM. */
965 : :
966 : : void
967 : 16267 : loop_versioning::analyze_arbitrary_term (address_info &address,
968 : : address_term_info &term)
969 : :
970 : : {
971 : : /* A multiplication offers two potential strides. Pick the one that
972 : : is most likely to be an innermost stride. */
973 : 16267 : tree expr = term.expr, alt = NULL_TREE;
974 : 16267 : gassign *mult = maybe_get_assign (expr);
975 : 29241 : if (mult && gimple_assign_rhs_code (mult) == MULT_EXPR)
976 : : {
977 : 534 : expr = strip_casts (gimple_assign_rhs1 (mult));
978 : 1068 : alt = strip_casts (gimple_assign_rhs2 (mult));
979 : : }
980 : 16267 : term.stride = expr;
981 : 16267 : term.inner_likelihood = get_inner_likelihood (expr, term.multiplier);
982 : 16267 : if (alt)
983 : : {
984 : 534 : inner_likelihood alt_l = get_inner_likelihood (alt, term.multiplier);
985 : 534 : if (alt_l > term.inner_likelihood)
986 : : {
987 : 193 : term.stride = alt;
988 : 193 : term.inner_likelihood = alt_l;
989 : : }
990 : : }
991 : 16267 : if (dump_enabled_p ())
992 : 40 : dump_inner_likelihood (address, term);
993 : 16267 : }
994 : :
995 : : /* Try to identify loop strides in ADDRESS and try to choose realistic
996 : : versioning opportunities based on these strides.
997 : :
998 : : The main difficulty here isn't finding strides that could be used
999 : : in a version check (that's pretty easy). The problem instead is to
1000 : : avoid versioning for some stride S that is unlikely ever to be 1 at
1001 : : runtime. Versioning for S == 1 on its own would lead to unnecessary
1002 : : code bloat, while adding S == 1 to more realistic version conditions
1003 : : would lose the optimisation opportunity offered by those other conditions.
1004 : :
1005 : : For example, versioning for a stride of 1 in the Fortran code:
1006 : :
1007 : : integer :: a(:,:)
1008 : : a(1,:) = 1
1009 : :
1010 : : is not usually a good idea, since the assignment is iterating over
1011 : : an outer dimension and is relatively unlikely to have a stride of 1.
1012 : : (It isn't impossible, since the inner dimension might be 1, or the
1013 : : array might be transposed.) Similarly, in:
1014 : :
1015 : : integer :: a(:,:), b(:,:)
1016 : : b(:,1) = a(1,:)
1017 : :
1018 : : b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't.
1019 : : Versioning for when both strides are 1 would lose most of the benefit
1020 : : of versioning for b's access.
1021 : :
1022 : : The approach we take is as follows:
1023 : :
1024 : : - Analyze each term to see whether it has an identifiable stride,
1025 : : regardless of which loop applies the stride.
1026 : :
1027 : : - Evaluate the likelihood that each such stride is for the innermost
1028 : : dimension of an array, on the scale "likely", "don't know" or "unlikely".
1029 : :
1030 : : - If there is a single "likely" innermost stride, and that stride is
1031 : : applied in the loop that contains STMT, version the loop for when the
1032 : : stride is 1. This deals with the cases in which we're fairly
1033 : : confident of doing the right thing, such as the b(:,1) reference above.
1034 : :
1035 : : - If there are no "likely" innermost strides, and the loop that contains
1036 : : STMT uses a stride that we rated as "don't know", version for when
1037 : : that stride is 1. This is principally used for C code such as:
1038 : :
1039 : : for (int i = 0; i < n; ++i)
1040 : : a[i * x] = ...;
1041 : :
1042 : : and:
1043 : :
1044 : : for (int j = 0; j < n; ++j)
1045 : : for (int i = 0; i < n; ++i)
1046 : : a[i * x + j * y] = ...;
1047 : :
1048 : : where nothing in the way "x" and "y" are set gives a hint as to
1049 : : whether "i" iterates over the innermost dimension of the array.
1050 : : In these situations it seems reasonable to assume the
1051 : : programmer has nested the loops appropriately (although of course
1052 : : there are examples like GEMM in which this assumption doesn't hold
1053 : : for all accesses in the loop).
1054 : :
1055 : : This case is also useful for the Fortran equivalent of the
1056 : : above C code. */
1057 : :
1058 : : void
1059 : 55230 : loop_versioning::analyze_address_fragment (address_info &address)
1060 : : {
1061 : 55230 : if (dump_enabled_p ())
1062 : : {
1063 : 171 : dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment ");
1064 : 171 : dump_address_info (MSG_NOTE, address);
1065 : 171 : dump_printf (MSG_NOTE, "\n");
1066 : : }
1067 : :
1068 : : /* Analyze each component of the sum to see whether it involves an
1069 : : apparent stride.
1070 : :
1071 : : There is an overlap between the addresses that
1072 : : find_per_loop_multiplication and analyze_term_using_scevs can handle,
1073 : : but the former is much cheaper than SCEV analysis, so try it first. */
1074 : 284290 : for (unsigned int i = 0; i < address.terms.length (); ++i)
1075 : 86915 : if (!find_per_loop_multiplication (address, address.terms[i])
1076 : 80890 : && !analyze_term_using_scevs (address, address.terms[i])
1077 : 123930 : && !POINTER_TYPE_P (TREE_TYPE (address.terms[i].expr)))
1078 : 16267 : analyze_arbitrary_term (address, address.terms[i]);
1079 : :
1080 : : /* Check for strides that are likely to be for the innermost dimension.
1081 : :
1082 : : 1. If there is a single likely inner stride, if it is an SSA name,
1083 : : and if it is worth versioning the loop for when the SSA name
1084 : : equals 1, record that we want to do so.
1085 : :
1086 : : 2. Otherwise, if there are any likely inner strides, bail out. This means
1087 : : one of:
1088 : :
1089 : : (a) There are multiple likely inner strides. This suggests we're
1090 : : confused and can't be confident of doing the right thing.
1091 : :
1092 : : (b) There is a single likely inner stride and it is a constant
1093 : : rather than an SSA name. This can mean either that the access
1094 : : is a natural one without any variable strides, such as:
1095 : :
1096 : : for (int i = 0; i < n; ++i)
1097 : : a[i] += 1;
1098 : :
1099 : : or that a variable stride is applied to an outer dimension,
1100 : : such as:
1101 : :
1102 : : for (int i = 0; i < n; ++i)
1103 : : for (int j = 0; j < n; ++j)
1104 : : a[j * stride][i] += 1;
1105 : :
1106 : : (c) There is a single likely inner stride, and it is an SSA name,
1107 : : but it isn't a worthwhile versioning opportunity. This usually
1108 : : means that the variable stride is applied by an outer loop,
1109 : : such as:
1110 : :
1111 : : for (int i = 0; i < n; ++i)
1112 : : for (int j = 0; j < n; ++j)
1113 : : a[j][i * stride] += 1;
1114 : :
1115 : : or (using an example with a more natural loop nesting):
1116 : :
1117 : : for (int i = 0; i < n; ++i)
1118 : : for (int j = 0; j < n; ++j)
1119 : : a[i][j] += b[i * stride];
1120 : :
1121 : : in cases where b[i * stride] cannot (yet) be hoisted for
1122 : : aliasing reasons.
1123 : :
1124 : : 3. If there are no likely inner strides, fall through to the next
1125 : : set of checks.
1126 : :
1127 : : Pointer equality is enough to check for uniqueness in (1), since we
1128 : : only care about SSA names. */
1129 : : tree chosen_stride = NULL_TREE;
1130 : : tree version_stride = NULL_TREE;
1131 : 141995 : for (unsigned int i = 0; i < address.terms.length (); ++i)
1132 : 86871 : if (chosen_stride != address.terms[i].stride
1133 : 86871 : && address.terms[i].inner_likelihood == INNER_LIKELY)
1134 : : {
1135 : 42202 : if (chosen_stride)
1136 : : return;
1137 : 42096 : chosen_stride = address.terms[i].stride;
1138 : 42096 : if (address.terms[i].versioning_opportunity_p)
1139 : 637 : version_stride = chosen_stride;
1140 : : }
1141 : :
1142 : : /* If there are no likely inner strides, see if there is a single
1143 : : versioning opportunity for a stride that was rated as INNER_DONT_KNOW.
1144 : : See the comment above the function for the cases that this code
1145 : : handles. */
1146 : 55124 : if (!chosen_stride)
1147 : 39969 : for (unsigned int i = 0; i < address.terms.length (); ++i)
1148 : 26844 : if (version_stride != address.terms[i].stride
1149 : 16814 : && address.terms[i].inner_likelihood == INNER_DONT_KNOW
1150 : 37349 : && address.terms[i].versioning_opportunity_p)
1151 : : {
1152 : 1708 : if (version_stride)
1153 : : return;
1154 : : version_stride = address.terms[i].stride;
1155 : : }
1156 : :
1157 : 55115 : if (version_stride)
1158 : 2325 : version_for_unity (address.stmt, version_stride);
1159 : : }
1160 : :
1161 : : /* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses
1162 : : TYPE_SIZE bytes and record this address fragment for later processing.
1163 : : STMT is the statement that contains the address. */
1164 : :
1165 : : void
1166 : 147517 : loop_versioning::record_address_fragment (gimple *stmt,
1167 : : unsigned HOST_WIDE_INT type_size,
1168 : : tree expr,
1169 : : unsigned HOST_WIDE_INT multiplier,
1170 : : HOST_WIDE_INT offset)
1171 : : {
1172 : : /* We're only interested in computed values. */
1173 : 147517 : if (TREE_CODE (expr) != SSA_NAME)
1174 : 23401 : return;
1175 : :
1176 : : /* Quick exit if no part of the address is calculated in STMT's loop,
1177 : : since such addresses have no versioning opportunities. */
1178 : 132802 : class loop *loop = loop_containing_stmt (stmt);
1179 : 132802 : if (expr_invariant_in_loop_p (loop, expr))
1180 : : return;
1181 : :
1182 : : /* Set up an address_info for EXPR * MULTIPLIER. */
1183 : 124370 : address_info *address = XOBNEW (&m_obstack, address_info);
1184 : 124370 : new (address) address_info;
1185 : 124370 : address->stmt = stmt;
1186 : 124370 : address->loop = loop;
1187 : 124370 : address->base = NULL_TREE;
1188 : 124370 : address->terms.quick_grow (1);
1189 : 124370 : address->terms[0].expr = expr;
1190 : 124370 : address->terms[0].multiplier = multiplier;
1191 : 124370 : address->terms[0].stride = NULL_TREE;
1192 : 124370 : address->terms[0].inner_likelihood = INNER_UNLIKELY;
1193 : 124370 : address->terms[0].versioning_opportunity_p = false;
1194 : 124370 : address->min_offset = offset;
1195 : :
1196 : : /* Peel apart the expression into a sum of address_terms, where each
1197 : : term is multiplied by a constant. Treat a + b and a - b the same,
1198 : : since it doesn't matter for our purposes whether an address is
1199 : : increasing or decreasing. Distribute (a + b) * constant into
1200 : : a * constant + b * constant.
1201 : :
1202 : : We don't care which loop each term belongs to, since we want to
1203 : : examine as many candidate strides as possible when determining
1204 : : which is likely to be for the innermost dimension. We therefore
1205 : : don't limit the search to statements in STMT's loop. */
1206 : 488379 : for (unsigned int i = 0; i < address->terms.length (); )
1207 : : {
1208 : 364009 : if (gassign *assign = maybe_get_assign (address->terms[i].expr))
1209 : : {
1210 : 221267 : tree_code code = gimple_assign_rhs_code (assign);
1211 : 221267 : if (code == PLUS_EXPR
1212 : 221267 : || code == POINTER_PLUS_EXPR
1213 : 103487 : || code == MINUS_EXPR)
1214 : : {
1215 : 119233 : tree op1 = gimple_assign_rhs1 (assign);
1216 : 119233 : tree op2 = gimple_assign_rhs2 (assign);
1217 : 119233 : if (TREE_CODE (op2) == INTEGER_CST)
1218 : : {
1219 : 58184 : address->terms[i].expr = strip_casts (op1);
1220 : : /* This is heuristic only, so don't worry about truncation
1221 : : or overflow. */
1222 : 58184 : address->min_offset += (TREE_INT_CST_LOW (op2)
1223 : 58184 : * address->terms[i].multiplier);
1224 : 58184 : continue;
1225 : : }
1226 : 61049 : else if (address->terms.length () < address_info::MAX_TERMS)
1227 : : {
1228 : 61010 : unsigned int j = address->terms.length ();
1229 : 61010 : address->terms.quick_push (address->terms[i]);
1230 : 61010 : address->terms[i].expr = strip_casts (op1);
1231 : 61010 : address->terms[j].expr = strip_casts (op2);
1232 : 61010 : continue;
1233 : 61010 : }
1234 : : }
1235 : 102073 : if (code == MULT_EXPR)
1236 : : {
1237 : 71990 : tree op1 = gimple_assign_rhs1 (assign);
1238 : 71990 : tree op2 = gimple_assign_rhs2 (assign);
1239 : 71990 : if (multiply_term_by (address->terms[i], op2))
1240 : : {
1241 : 52068 : address->terms[i].expr = strip_casts (op1);
1242 : 52068 : continue;
1243 : : }
1244 : : }
1245 : 50005 : if (CONVERT_EXPR_CODE_P (code))
1246 : : {
1247 : 7367 : tree op1 = gimple_assign_rhs1 (assign);
1248 : 7367 : address->terms[i].expr = strip_casts (op1);
1249 : 7367 : continue;
1250 : 7367 : }
1251 : : }
1252 : 185380 : i += 1;
1253 : : }
1254 : :
1255 : : /* Peel off any symbolic pointer. */
1256 : 124370 : if (TREE_CODE (address->terms[0].expr) != SSA_NAME
1257 : 124370 : && address->terms[0].multiplier == 1)
1258 : : {
1259 : 4880 : if (address->terms.length () == 1)
1260 : : {
1261 : 0 : obstack_free (&m_obstack, address);
1262 : 0 : return;
1263 : : }
1264 : 4880 : address->base = address->terms[0].expr;
1265 : 4880 : address->terms.ordered_remove (0);
1266 : : }
1267 : :
1268 : : /* Require all remaining terms to be SSA names. (This could be false
1269 : : for unfolded statements, but they aren't worth dealing with.) */
1270 : 608532 : for (unsigned int i = 0; i < address->terms.length (); ++i)
1271 : 180150 : if (TREE_CODE (address->terms[i].expr) != SSA_NAME)
1272 : : {
1273 : 254 : obstack_free (&m_obstack, address);
1274 : 254 : return;
1275 : : }
1276 : :
1277 : : /* The loop above set MIN_OFFSET based on the first byte of the
1278 : : referenced data. Calculate the end + 1. */
1279 : 124116 : address->max_offset = address->min_offset + type_size;
1280 : :
1281 : : /* Put the terms into a canonical order for the hash table lookup below. */
1282 : 124116 : address->terms.qsort (compare_address_terms);
1283 : :
1284 : 124116 : if (dump_enabled_p ())
1285 : : {
1286 : 249 : dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr);
1287 : 249 : if (multiplier != 1)
1288 : 118 : dump_printf (MSG_NOTE, " * %wd", multiplier);
1289 : 249 : dump_printf (MSG_NOTE, " = ");
1290 : 249 : dump_address_info (MSG_NOTE, *address);
1291 : 249 : dump_printf (MSG_NOTE, "\n");
1292 : : }
1293 : :
1294 : : /* Pool address information with the same terms (but potentially
1295 : : different offsets). */
1296 : 124116 : address_info **slot = m_address_table.find_slot (address, INSERT);
1297 : 124116 : if (address_info *old_address = *slot)
1298 : : {
1299 : : /* We've already seen an address with the same terms. Extend the
1300 : : offset range to account for this access. Doing this can paper
1301 : : over gaps, such as in:
1302 : :
1303 : : a[i * stride * 4] + a[i * stride * 4 + 3];
1304 : :
1305 : : where nothing references "+ 1" or "+ 2". However, the vectorizer
1306 : : handles such gapped accesses without problems, so it's not worth
1307 : : trying to exclude them. */
1308 : 64007 : if (old_address->min_offset > address->min_offset)
1309 : 14950 : old_address->min_offset = address->min_offset;
1310 : 64007 : if (old_address->max_offset < address->max_offset)
1311 : 12010 : old_address->max_offset = address->max_offset;
1312 : 64007 : obstack_free (&m_obstack, address);
1313 : : }
1314 : : else
1315 : : {
1316 : : /* This is the first time we've seen an address with these terms. */
1317 : 60109 : *slot = address;
1318 : 60109 : m_address_list.safe_push (address);
1319 : : }
1320 : : }
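
/* A hypothetical worked example of the decomposition above (illustrative
   GIMPLE, not taken from any particular testcase): for a 4-byte access

     tmp1 = i * stride;
     tmp2 = tmp1 + j;
     tmp3 = tmp2 * 4;
     addr = p + tmp3;
     ... = MEM[addr + 8];

   the analysis starts with the single term addr (multiplier 1) and
   offset 8, splits the pointer addition into p and tmp3, folds the
   constant factor so that tmp3 becomes tmp2 with multiplier 4, and then
   splits the addition into tmp1 and j (both with multiplier 4).  The
   recorded fragment would be dumped as:

     p + tmp1 * 4 + j * 4 + [8, 11]  */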
1321 : :
1322 : : /* Analyze expression EXPR, which occurs in STMT. */
1323 : :
1324 : : void
1325 : 1369028 : loop_versioning::analyze_expr (gimple *stmt, tree expr)
1326 : : {
1327 : 1369028 : unsigned HOST_WIDE_INT type_size;
1328 : :
1329 : 1524366 : while (handled_component_p (expr))
1330 : : {
1331 : : /* See whether we can use versioning to avoid a multiplication
1332 : : in an array index. */
1333 : 155338 : if (TREE_CODE (expr) == ARRAY_REF
1334 : 155338 : && acceptable_type_p (TREE_TYPE (expr), &type_size))
1335 : 88053 : record_address_fragment (stmt, type_size,
1336 : 88053 : TREE_OPERAND (expr, 1), type_size, 0);
1337 : 155338 : expr = TREE_OPERAND (expr, 0);
1338 : : }
1339 : :
1340 : : /* See whether we can use versioning to avoid a multiplication
1341 : : in the pointer calculation of a MEM_REF. */
1342 : 1369028 : if (TREE_CODE (expr) == MEM_REF
1343 : 1369028 : && acceptable_type_p (TREE_TYPE (expr), &type_size))
1344 : 59464 : record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1,
1345 : : /* This is heuristic only, so don't worry
1346 : : about truncation or overflow. */
1347 : 59464 : TREE_INT_CST_LOW (TREE_OPERAND (expr, 1)));
1348 : :
1349 : : /* These would be easy to handle if they existed at this stage. */
1350 : 1369028 : gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF);
1351 : 1369028 : }
1352 : :
1353 : : /* Analyze all the statements in BB looking for useful version checks.
1354 : : Return true on success, false if something prevents the block from
1355 : : being versioned. */
1356 : :
1357 : : bool
1358 : 220261 : loop_versioning::analyze_block (basic_block bb)
1359 : : {
1360 : 220261 : class loop *loop = bb->loop_father;
1361 : 220261 : loop_info &li = get_loop_info (loop);
1362 : 1198848 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1363 : 758326 : gsi_next (&gsi))
1364 : : {
1365 : 773992 : gimple *stmt = gsi_stmt (gsi);
1366 : 773992 : if (is_gimple_debug (stmt))
1367 : 102761 : continue;
1368 : :
1369 : 691722 : if (expensive_stmt_p (stmt))
1370 : : {
1371 : 15666 : if (dump_enabled_p ())
1372 : 58 : dump_printf_loc (MSG_NOTE, stmt, "expensive statement"
1373 : : " prevents versioning: %G", stmt);
1374 : 15666 : return false;
1375 : : }
1376 : :
1377 : : /* Only look for direct versioning opportunities in inner loops
1378 : : since the benefit tends to be much smaller for outer loops. */
1379 : 655565 : if (!loop->inner)
1380 : : {
1381 : 557002 : unsigned int nops = gimple_num_ops (stmt);
1382 : 2101832 : for (unsigned int i = 0; i < nops; ++i)
1383 : 1544830 : if (tree op = gimple_op (stmt, i))
1384 : 1369028 : analyze_expr (stmt, op);
1385 : : }
1386 : :
1387 : : /* The point of the instruction limit is to prevent excessive
1388 : : code growth, so this is a size-based estimate even though
1389 : : the optimization is aimed at speed. */
1390 : 655565 : li.num_insns += estimate_num_insns (stmt, &eni_size_weights);
1391 : : }
1392 : :
1393 : : return true;
1394 : : }
1395 : :
1396 : : /* Analyze all the blocks in the function, looking for useful version checks.
1397 : : Return true if we found one. */
1398 : :
1399 : : bool
1400 : 24579 : loop_versioning::analyze_blocks ()
1401 : : {
1402 : 24579 : AUTO_DUMP_SCOPE ("analyze_blocks",
1403 : : dump_user_location_t::from_function_decl (m_fn->decl));
1404 : :
1405 : : /* For now we don't try to version the whole function, although
1406 : : versioning at that level could be useful in some cases. */
1407 : 24579 : get_loop_info (get_loop (m_fn, 0)).rejected_p = true;
1408 : :
1409 : 142821 : for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
1410 : : {
1411 : 69084 : loop_info &linfo = get_loop_info (loop);
1412 : :
1413 : : /* Ignore cold loops. */
1414 : 69084 : if (!optimize_loop_for_speed_p (loop))
1415 : 1372 : linfo.rejected_p = true;
1416 : :
1417 : : /* See whether an inner loop prevents versioning of this loop. */
1418 : 69084 : if (!linfo.rejected_p)
1419 : 79045 : for (class loop *inner = loop->inner; inner; inner = inner->next)
1420 : 15138 : if (get_loop_info (inner).rejected_p)
1421 : : {
1422 : 3805 : linfo.rejected_p = true;
1423 : 3805 : break;
1424 : : }
1425 : :
1426 : : /* If versioning the loop is still a possibility, examine the
1427 : : statements in the loop to look for versioning opportunities. */
1428 : 69084 : if (!linfo.rejected_p)
1429 : : {
1430 : 63907 : void *start_point = obstack_alloc (&m_obstack, 0);
1431 : :
1432 : 268502 : for (basic_block bb = linfo.block_list; bb;
1433 : 204595 : bb = m_next_block_in_loop[bb->index])
1434 : 220261 : if (!analyze_block (bb))
1435 : : {
1436 : 15666 : linfo.rejected_p = true;
1437 : 15666 : break;
1438 : : }
1439 : :
1440 : 63907 : if (!linfo.rejected_p)
1441 : : {
1442 : : /* Process any queued address fragments, now that we have
1443 : : complete grouping information. */
1444 : : address_info *address;
1445 : : unsigned int i;
1446 : 103471 : FOR_EACH_VEC_ELT (m_address_list, i, address)
1447 : 55230 : analyze_address_fragment (*address);
1448 : : }
1449 : :
1450 : 63907 : m_address_table.empty ();
1451 : 63907 : m_address_list.truncate (0);
1452 : 63907 : obstack_free (&m_obstack, start_point);
1453 : : }
1454 : 24579 : }
1455 : :
1456 : 24579 : return m_num_conditions != 0;
1457 : : }
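: : /* The obstack handling above follows the usual checkpoint/rollback
: :    idiom: a zero-sized allocation marks the current position and freeing
: :    that marker releases everything allocated after it.  A minimal sketch
: :    of the pattern (reusing the member names from this pass):
: :
: :      void *start_point = obstack_alloc (&m_obstack, 0);
: :      ...allocate per-loop address_info records on m_obstack...
: :      obstack_free (&m_obstack, start_point);
: :
: :    so each loop's scratch data is discarded before the next loop in the
: :    LI_FROM_INNERMOST walk is analyzed.  */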
1458 : :
1459 : : /* Use range information to remove impossible versioning conditions from
1460 : : LOOP. */
1461 : :
1462 : : void
1463 : 3929 : loop_versioning::prune_loop_conditions (class loop *loop)
1464 : : {
1465 : 3929 : loop_info &li = get_loop_info (loop);
1466 : :
1467 : 3929 : int to_remove = -1;
1468 : 3929 : bitmap_iterator bi;
1469 : 3929 : unsigned int i;
1470 : 3929 : int_range_max r;
1471 : 5256 : EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1472 : : {
1473 : 1327 : tree name = ssa_name (i);
1474 : 1327 : gimple *stmt = first_stmt (loop->header);
1475 : :
1476 : 2654 : if (get_range_query (cfun)->range_of_expr (r, name, stmt)
1477 : 2654 : && !r.contains_p (wi::one (TYPE_PRECISION (TREE_TYPE (name)))))
1478 : : {
1479 : 40 : if (dump_enabled_p ())
1480 : 2 : dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1481 : : "%T can never be 1 in this loop\n", name);
1482 : :
1483 : 40 : if (to_remove >= 0)
1484 : 0 : bitmap_clear_bit (&li.unity_names, to_remove);
1485 : 40 : to_remove = i;
1486 : 40 : m_num_conditions -= 1;
1487 : : }
1488 : : }
1489 : 3929 : if (to_remove >= 0)
1490 : 40 : bitmap_clear_bit (&li.unity_names, to_remove);
1491 : 3929 : }
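: : /* As an illustration (hypothetical source, not a testcase from the
: :    suite): a scheduled "stride == 1" check can be pruned when range
: :    information at the loop header already rules the value out:
: :
: :      if (stride < 2)
: :        return;
: :      for (int i = 0; i < n; ++i)   // stride is known to be >= 2 here
: :        x[i * stride] = 0;
: :
: :    range_of_expr proves that STRIDE can never be 1 inside the loop, so
: :    keeping the condition would only duplicate code for a dead case.  */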
1492 : :
1493 : : /* Remove any scheduled loop version conditions that will never be true.
1494 : : Return true if any remain. */
1495 : :
1496 : : bool
1497 : 642 : loop_versioning::prune_conditions ()
1498 : : {
1499 : 642 : AUTO_DUMP_SCOPE ("prune_loop_conditions",
1500 : : dump_user_location_t::from_function_decl (m_fn->decl));
1501 : :
1502 : 642 : calculate_dominance_info (CDI_DOMINATORS);
1503 : 642 : lv_dom_walker dom_walker (*this);
1504 : 642 : dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn));
1505 : 642 : return m_num_conditions != 0;
1506 : 642 : }
1507 : :
1508 : : /* Merge the version checks for INNER into immediately-enclosing loop
1509 : : OUTER. */
1510 : :
1511 : : void
1512 : 464 : loop_versioning::merge_loop_info (class loop *outer, class loop *inner)
1513 : : {
1514 : 464 : loop_info &inner_li = get_loop_info (inner);
1515 : 464 : loop_info &outer_li = get_loop_info (outer);
1516 : :
1517 : 464 : if (dump_enabled_p ())
1518 : : {
1519 : 11 : bitmap_iterator bi;
1520 : 11 : unsigned int i;
1521 : 22 : EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi)
1522 : 11 : if (!bitmap_bit_p (&outer_li.unity_names, i))
1523 : 11 : dump_printf_loc (MSG_NOTE, find_loop_location (inner),
1524 : : "hoisting check that %T == 1 to outer loop\n",
1525 : 11 : ssa_name (i));
1526 : : }
1527 : :
1528 : 464 : bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names);
1529 : 473 : if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost))
1530 : 306 : outer_li.outermost = inner_li.outermost;
1531 : 464 : }
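: : /* For example (illustrative code, not taken from a testcase): in a nest
: :    such as
: :
: :      for (int j = 0; j < m; ++j)
: :        for (int i = 0; i < n; ++i)
: :          x[i * stride + j] = 0;
: :
: :    the "stride == 1" check collected for the inner loop is merged into
: :    the outer loop's unity_names, so a single guard placed around the
: :    whole nest can select the simplified copies of both loops.  */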
1532 : :
1533 : : /* Add LOOP to the queue of loops to version. */
1534 : :
1535 : : void
1536 : 1054 : loop_versioning::add_loop_to_queue (class loop *loop)
1537 : : {
1538 : 1054 : loop_info &li = get_loop_info (loop);
1539 : :
1540 : 1054 : if (dump_enabled_p ())
1541 : 68 : dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1542 : : "queuing this loop for versioning\n");
1543 : 1054 : m_loops_to_version.safe_push (loop);
1544 : :
1545 : : /* Don't try to version superloops. */
1546 : 1054 : li.rejected_p = true;
1547 : 1054 : }
1548 : :
1549 : : /* Decide whether the cost model would allow us to version LOOP,
1550 : : either directly or as part of a parent loop, and return true if so.
1551 : : This does not imply that the loop is actually worth versioning in its
1552 : : own right, just that it would be valid to version it if something
1553 : : benefited.
1554 : :
1555 : : We have already made this decision for all inner loops of LOOP. */
1556 : :
1557 : : bool
1558 : 3238 : loop_versioning::decide_whether_loop_is_versionable (class loop *loop)
1559 : : {
1560 : 3238 : loop_info &li = get_loop_info (loop);
1561 : :
1562 : 3238 : if (li.rejected_p)
1563 : : return false;
1564 : :
1565 : : /* Examine the decisions made for inner loops. */
1566 : 3019 : for (class loop *inner = loop->inner; inner; inner = inner->next)
1567 : : {
1568 : 576 : loop_info &inner_li = get_loop_info (inner);
1569 : 576 : if (inner_li.rejected_p)
1570 : : {
1571 : 6 : if (dump_enabled_p ())
1572 : 0 : dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1573 : : "not versioning this loop because one of its"
1574 : : " inner loops should not be versioned\n");
1575 : 6 : return false;
1576 : : }
1577 : :
1578 : 570 : if (inner_li.worth_versioning_p ())
1579 : 341 : li.subloops_benefit_p = true;
1580 : :
1581 : : /* Accumulate the number of instructions from subloops that are not
1582 : : the innermost, or that don't benefit from versioning. Only the
1583 : : instructions from innermost loops that benefit from versioning
1584 : : should be weighed against loop-versioning-max-inner-insns;
1585 : : everything else should be weighed against
1586 : : loop-versioning-max-outer-insns. */
1587 : 570 : if (!inner_li.worth_versioning_p () || inner->inner)
1588 : : {
1589 : 264 : if (dump_enabled_p ())
1590 : 0 : dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1591 : : "counting %d instructions from this loop"
1592 : : " against its parent loop\n", inner_li.num_insns);
1593 : 264 : li.num_insns += inner_li.num_insns;
1594 : : }
1595 : : }
1596 : :
1597 : : /* Enforce the size limits. */
1598 : 3832 : if (li.worth_versioning_p ())
1599 : : {
1600 : 1371 : unsigned int max_num_insns = max_insns_for_loop (loop);
1601 : 1371 : if (dump_enabled_p ())
1602 : 79 : dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1603 : : "this loop has %d instructions, against"
1604 : : " a versioning limit of %d\n",
1605 : : li.num_insns, max_num_insns);
1606 : 1371 : if (li.num_insns > max_num_insns)
1607 : : {
1608 : 10 : if (dump_enabled_p ())
1609 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION
1610 : : | MSG_PRIORITY_USER_FACING,
1611 : 0 : find_loop_location (loop),
1612 : : "this loop is too big to version");
1613 : 10 : return false;
1614 : : }
1615 : : }
1616 : :
1617 : : /* Hoist all version checks from subloops to this loop. */
1618 : 2897 : for (class loop *subloop = loop->inner; subloop; subloop = subloop->next)
1619 : 464 : merge_loop_info (loop, subloop);
1620 : :
1621 : : return true;
1622 : : }
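: : /* The limits checked above are user-tunable.  When experimenting, a
: :    command along these lines raises the budget for innermost loops
: :    (TEST.C is a placeholder and the default values differ between
: :    releases):
: :
: :      gcc -O3 -fversion-loops-for-strides \
: :          --param loop-versioning-max-inner-insns=400 test.c
: :
: :    Loops rejected here are reported in the dump as "this loop is too
: :    big to version".  */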
1623 : :
1624 : : /* Decide which loops to version and add them to the versioning queue.
1625 : : Return true if there are any loops to version. */
1626 : :
1627 : : bool
1628 : 631 : loop_versioning::make_versioning_decisions ()
1629 : : {
1630 : 631 : AUTO_DUMP_SCOPE ("make_versioning_decisions",
1631 : : dump_user_location_t::from_function_decl (m_fn->decl));
1632 : :
1633 : 5131 : for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
1634 : : {
1635 : 3238 : loop_info &linfo = get_loop_info (loop);
1636 : 3238 : if (decide_whether_loop_is_versionable (loop))
1637 : : {
1638 : : /* Commit to versioning LOOP directly if we can't hoist the
1639 : : version checks any further. */
1640 : 7032 : if (linfo.worth_versioning_p ()
1641 : 2433 : && (loop_depth (loop) == 1 || linfo.outermost == loop))
1642 : 988 : add_loop_to_queue (loop);
1643 : : }
1644 : : else
1645 : : {
1646 : : /* We can't version this loop, so individually version any
1647 : : subloops that would benefit and haven't been versioned yet. */
1648 : 805 : linfo.rejected_p = true;
1649 : 1447 : for (class loop *subloop = loop->inner; subloop;
1650 : 642 : subloop = subloop->next)
1651 : 840 : if (get_loop_info (subloop).worth_versioning_p ())
1652 : 66 : add_loop_to_queue (subloop);
1653 : : }
1654 : 631 : }
1655 : :
1656 : 1262 : return !m_loops_to_version.is_empty ();
1657 : : }
1658 : :
1659 : : /* Attempt to implement loop versioning for LOOP, using the information
1660 : : cached in the associated loop_info. Return true on success. */
1661 : :
1662 : : bool
1663 : 1054 : loop_versioning::version_loop (class loop *loop)
1664 : : {
1665 : 1054 : loop_info &li = get_loop_info (loop);
1666 : :
1667 : : /* Build up a condition that selects the original loop instead of
1668 : : the simplified loop. */
1669 : 1054 : tree cond = boolean_false_node;
1670 : 1054 : bitmap_iterator bi;
1671 : 1054 : unsigned int i;
1672 : 2341 : EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1673 : : {
1674 : 1287 : tree name = ssa_name (i);
1675 : 1287 : tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name,
1676 : : build_one_cst (TREE_TYPE (name)));
1677 : 1287 : cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one);
1678 : : }
1679 : :
1680 : : /* Convert the condition into a suitable gcond. */
1681 : 1054 : gimple_seq stmts = NULL;
1682 : 1054 : cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr_for_cond,
1683 : : NULL_TREE);
1684 : :
1685 : : /* Version the loop. */
1686 : 1054 : initialize_original_copy_tables ();
1687 : 1054 : basic_block cond_bb;
1688 : 1054 : li.optimized_loop = loop_version (loop, cond, &cond_bb,
1689 : : profile_probability::unlikely (),
1690 : : profile_probability::likely (),
1691 : : profile_probability::unlikely (),
1692 : : profile_probability::likely (), true);
1693 : 1054 : free_original_copy_tables ();
1694 : 1054 : if (!li.optimized_loop)
1695 : : {
1696 : 2 : if (dump_enabled_p ())
1697 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop),
1698 : : "tried but failed to version this loop for when"
1699 : : " certain strides are 1\n");
1700 : 2 : return false;
1701 : : }
1702 : :
1703 : 1052 : if (dump_enabled_p ())
1704 : 68 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
1705 : : "versioned this loop for when certain strides are 1\n");
1706 : :
1707 : : /* Insert the statements that feed COND. */
1708 : 1052 : if (stmts)
1709 : : {
1710 : 159 : gimple_stmt_iterator gsi = gsi_last_bb (cond_bb);
1711 : 159 : gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1712 : : }
1713 : :
1714 : : return true;
1715 : : }
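: : /* Concretely, if a loop has queued unity checks on S1 and S2, the
: :    condition built above is the disjunction of the "!= 1" tests, giving
: :    a structure along the lines of (an illustrative shape, not real dump
: :    output):
: :
: :      if (s1 != 1 || s2 != 1)
: :        ...original loop, on the edge marked unlikely...
: :      else
: :        ...new copy, simplified later under s1 == 1 && s2 == 1...
: :
: :    COND selects the original loop, and the probabilities passed to
: :    loop_version weight execution towards the simplified copy.  */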
1716 : :
1717 : : /* Attempt to version all loops in the versioning queue. */
1718 : :
1719 : : void
1720 : 631 : loop_versioning::implement_versioning_decisions ()
1721 : : {
1722 : : /* No AUTO_DUMP_SCOPE here since all messages are top-level and
1723 : : user-facing at this point. */
1724 : :
1725 : 631 : bool any_succeeded_p = false;
1726 : 631 : class loop *loop;
1727 : 631 : unsigned int i;
1728 : 1685 : FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1729 : 1054 : if (version_loop (loop))
1730 : 1052 : any_succeeded_p = true;
1731 : 631 : if (!any_succeeded_p)
1732 : 631 : return;
1733 : :
1734 : 629 : update_ssa (TODO_update_ssa);
1735 : :
1736 : : /* Simplify the new loop, which is used when COND is false. */
1737 : 1681 : FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1738 : : {
1739 : 1052 : loop_info &linfo = get_loop_info (loop);
1740 : 1052 : if (linfo.optimized_loop)
1741 : 1052 : name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header);
1742 : : }
1743 : : }
1744 : :
1745 : : /* Run the pass and return a set of TODO_* flags. */
1746 : :
1747 : : unsigned int
1748 : 24579 : loop_versioning::run ()
1749 : : {
1750 : 24579 : gcc_assert (scev_initialized_p ());
1751 : :
1752 : 24579 : if (analyze_blocks ()
1753 : 642 : && prune_conditions ()
1754 : 25210 : && make_versioning_decisions ())
1755 : 631 : implement_versioning_decisions ();
1756 : :
1757 : 24579 : return 0;
1758 : : }
1759 : :
1760 : : /* Loop versioning pass. */
1761 : :
1762 : : const pass_data pass_data_loop_versioning =
1763 : : {
1764 : : GIMPLE_PASS, /* type */
1765 : : "lversion", /* name */
1766 : : OPTGROUP_LOOP, /* optinfo_flags */
1767 : : TV_LOOP_VERSIONING, /* tv_id */
1768 : : PROP_cfg, /* properties_required */
1769 : : 0, /* properties_provided */
1770 : : 0, /* properties_destroyed */
1771 : : 0, /* todo_flags_start */
1772 : : 0, /* todo_flags_finish */
1773 : : };
1774 : :
1775 : : class pass_loop_versioning : public gimple_opt_pass
1776 : : {
1777 : : public:
1778 : 273196 : pass_loop_versioning (gcc::context *ctxt)
1779 : 546392 : : gimple_opt_pass (pass_data_loop_versioning, ctxt)
1780 : : {}
1781 : :
1782 : : /* opt_pass methods: */
1783 : 221759 : bool gate (function *) final override
1784 : : {
1785 : 221759 : return flag_version_loops_for_strides;
1786 : : }
1787 : : unsigned int execute (function *) final override;
1788 : : };
1789 : :
1790 : : unsigned int
1791 : 24579 : pass_loop_versioning::execute (function *fn)
1792 : : {
1793 : 49158 : if (number_of_loops (fn) <= 1)
1794 : : return 0;
1795 : :
1796 : 24579 : enable_ranger (fn);
1797 : 24579 : unsigned int ret = loop_versioning (fn).run ();
1798 : 24579 : disable_ranger (fn);
1799 : 24579 : return ret;
1800 : : }
1801 : :
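: : /* Usage summary (an approximation; the GCC manual is authoritative for
: :    option semantics): the pass is gated on -fversion-loops-for-strides,
: :    runs only when the function contains at least one real loop, and
: :    reports its decisions in the "lversion" dump, e.g. via
: :    -fdump-tree-lversion-details or -fopt-info-loop.  */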
1802 : : } // anon namespace
1803 : :
1804 : : gimple_opt_pass *
1805 : 273196 : make_pass_loop_versioning (gcc::context *ctxt)
1806 : : {
1807 : 273196 : return new pass_loop_versioning (ctxt);
1808 : : }