Branch data Line data Source code
1 : : /* Loop versioning pass.
2 : : Copyright (C) 2018-2024 Free Software Foundation, Inc.
3 : :
4 : : This file is part of GCC.
5 : :
6 : : GCC is free software; you can redistribute it and/or modify it
7 : : under the terms of the GNU General Public License as published by the
8 : : Free Software Foundation; either version 3, or (at your option) any
9 : : later version.
10 : :
11 : : GCC is distributed in the hope that it will be useful, but WITHOUT
12 : : ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 : : for more details.
15 : :
16 : : You should have received a copy of the GNU General Public License
17 : : along with GCC; see the file COPYING3. If not see
18 : : <http://www.gnu.org/licenses/>. */
19 : :
20 : : #include "config.h"
21 : : #include "system.h"
22 : : #include "coretypes.h"
23 : : #include "backend.h"
24 : : #include "tree.h"
25 : : #include "gimple.h"
26 : : #include "gimple-iterator.h"
27 : : #include "tree-pass.h"
28 : : #include "gimplify-me.h"
29 : : #include "cfgloop.h"
30 : : #include "tree-ssa-loop.h"
31 : : #include "ssa.h"
32 : : #include "tree-scalar-evolution.h"
33 : : #include "tree-ssa-loop-ivopts.h"
34 : : #include "fold-const.h"
35 : : #include "tree-ssa-propagate.h"
36 : : #include "tree-inline.h"
37 : : #include "domwalk.h"
38 : : #include "tree-vectorizer.h"
39 : : #include "omp-general.h"
40 : : #include "predict.h"
41 : : #include "tree-into-ssa.h"
42 : : #include "gimple-range.h"
43 : : #include "tree-cfg.h"
44 : :
45 : : namespace {
46 : :
47 : : /* This pass looks for loops that could be simplified if certain loop
48 : : invariant conditions were true. It is effectively a form of loop
49 : : splitting in which the pass produces the split conditions itself,
50 : : instead of using ones that are already present in the IL.
51 : :
52 : : Versioning for when strides are 1
53 : : ---------------------------------
54 : :
55 : : At the moment the only thing the pass looks for are memory references
56 : : like:
57 : :
58 : : for (auto i : ...)
59 : : ...x[i * stride]...
60 : :
61 : : It considers changing such loops to:
62 : :
63 : : if (stride == 1)
64 : : for (auto i : ...) [A]
65 : : ...x[i]...
66 : : else
67 : : for (auto i : ...) [B]
68 : : ...x[i * stride]...
69 : :
70 : : This can have several benefits:
71 : :
72 : : (1) [A] is often easier or cheaper to vectorize than [B].
73 : :
74 : : (2) The scalar code in [A] is simpler than the scalar code in [B]
75 : : (if the loops cannot be vectorized or need an epilogue loop).
76 : :
77 : : (3) We might recognize [A] as a pattern, such as a memcpy or memset.
78 : :
79 : : (4) [A] has simpler address evolutions, which can help other passes
80 : : like loop interchange.
81 : :
82 : : The optimization is particularly useful for assumed-shape arrays in
83 : : Fortran, where the stride of the innermost dimension depends on the
84 : : array descriptor but is often equal to 1 in practice. For example:
85 : :
86 : : subroutine f1(x)
87 : : real :: x(:)
88 : : x(:) = 100
89 : : end subroutine f1
90 : :
91 : : generates the equivalent of:
92 : :
93 : : raw_stride = *x.dim[0].stride;
94 : : stride = raw_stride != 0 ? raw_stride : 1;
95 : : x_base = *x.data;
96 : : ...
97 : : tmp1 = stride * S;
98 : : tmp2 = tmp1 - stride;
99 : : *x_base[tmp2] = 1.0e+2;
100 : :
101 : : but in the common case that stride == 1, the last three statements
102 : : simplify to:
103 : :
104 : : tmp3 = S + -1;
105 : : *x_base[tmp3] = 1.0e+2;
106 : :
107 : : The optimization is in principle very simple. The difficult parts are:
108 : :
109 : : (a) deciding which parts of a general address calculation correspond
110 : : to the inner dimension of an array, since this usually isn't explicit
111 : : in the IL, and for C often isn't even explicit in the source code
112 : :
113 : : (b) estimating when the transformation is worthwhile
114 : :
115 : : Structure
116 : : ---------
117 : :
118 : : The pass has four phases:
119 : :
120 : : (1) Walk through the statements looking for and recording potential
121 : : versioning opportunities. Stop if there are none.
122 : :
123 : : (2) Use context-sensitive range information to see whether any versioning
124 : : conditions are impossible in practice. Remove them if so, and stop
125 : : if no opportunities remain.
126 : :
127 : : (We do this only after (1) to keep compile time down when no
128 : : versioning opportunities exist.)
129 : :
130 : : (3) Apply the cost model. Decide which versioning opportunities are
131 : : worthwhile and at which nesting level they should be applied.
132 : :
133 : : (4) Attempt to version all the loops selected by (3), so that:
134 : :
135 : : for (...)
136 : : ...
137 : :
138 : : becomes:
139 : :
140 : : if (!cond)
141 : : for (...) // Original loop
142 : : ...
143 : : else
144 : : for (...) // New loop
145 : : ...
146 : :
147 : : Use the version condition COND to simplify the new loop. */
148 : :
149 : : /* Enumerates the likelihood that a particular value indexes the inner
150 : : dimension of an array. */
151 : : enum inner_likelihood {
152 : : INNER_UNLIKELY,
153 : : INNER_DONT_KNOW,
154 : : INNER_LIKELY
155 : : };
156 : :
157 : : /* Information about one term of an address_info. */
158 : : struct address_term_info
159 : : {
160 : : /* The value of the term is EXPR * MULTIPLIER. */
161 : : tree expr;
162 : : unsigned HOST_WIDE_INT multiplier;
163 : :
164 : : /* The stride applied by EXPR in each iteration of some unrecorded loop,
165 : : or null if no stride has been identified. */
166 : : tree stride;
167 : :
168 : : /* Enumerates the likelihood that EXPR indexes the inner dimension
169 : : of an array. */
170 : : enum inner_likelihood inner_likelihood;
171 : :
172 : : /* True if STRIDE == 1 is a versioning opportunity when considered
173 : : in isolation. */
174 : : bool versioning_opportunity_p;
175 : : };
176 : :
177 : : /* Information about an address calculation, and the range of constant
178 : : offsets applied to it. */
179 : 133401 : class address_info
180 : : {
181 : : public:
182 : : static const unsigned int MAX_TERMS = 8;
183 : :
184 : : /* One statement that calculates the address. If multiple statements
185 : : share the same address, we only record the first. */
186 : : gimple *stmt;
187 : :
188 : : /* The loop containing STMT (cached for convenience). If multiple
189 : : statements share the same address, they all belong to this loop. */
190 : : class loop *loop;
191 : :
192 : : /* A decomposition of the calculation into a sum of terms plus an
193 : : optional base. When BASE is provided, it is never an SSA name.
194 : : Once initialization is complete, all members of TERMs are SSA names. */
195 : : tree base;
196 : : auto_vec<address_term_info, MAX_TERMS> terms;
197 : :
198 : : /* All bytes accessed from the address fall in the offset range
199 : : [MIN_OFFSET, MAX_OFFSET). */
200 : : HOST_WIDE_INT min_offset, max_offset;
201 : : };
202 : :
203 : : /* Stores addresses based on their base and terms (ignoring the offsets). */
204 : : struct address_info_hasher : nofree_ptr_hash <address_info>
205 : : {
206 : : static hashval_t hash (const address_info *);
207 : : static bool equal (const address_info *, const address_info *);
208 : : };
209 : :
210 : : /* Information about the versioning we'd like to apply to a loop. */
211 : 134738 : class loop_info
212 : : {
213 : : public:
214 : : bool worth_versioning_p () const;
215 : :
216 : : /* True if we've decided not to version this loop. The remaining
217 : : fields are meaningless if so. */
218 : : bool rejected_p;
219 : :
220 : : /* True if at least one subloop of this loop benefits from versioning. */
221 : : bool subloops_benefit_p;
222 : :
223 : : /* An estimate of the total number of instructions in the loop,
224 : : excluding those in subloops that benefit from versioning. */
225 : : unsigned int num_insns;
226 : :
227 : : /* The outermost loop that can handle all the version checks
228 : : described below. */
229 : : class loop *outermost;
230 : :
231 : : /* The first entry in the list of blocks that belong to this loop
232 : : (and not to subloops). m_next_block_in_loop provides the chain
233 : : pointers for the list. */
234 : : basic_block block_list;
235 : :
236 : : /* We'd like to version the loop for the case in which these SSA names
237 : : (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime. */
238 : : bitmap_head unity_names;
239 : :
240 : : /* If versioning succeeds, this points the version of the loop that
241 : : assumes the version conditions holds. */
242 : : class loop *optimized_loop;
243 : : };
244 : :
245 : : /* The main pass structure. */
246 : : class loop_versioning
247 : : {
248 : : public:
249 : : loop_versioning (function *);
250 : : ~loop_versioning ();
251 : : unsigned int run ();
252 : :
253 : : private:
254 : : /* Used to walk the dominator tree to find loop versioning conditions
255 : : that are always false. */
256 : 1316 : class lv_dom_walker : public dom_walker
257 : : {
258 : : public:
259 : : lv_dom_walker (loop_versioning &);
260 : :
261 : : edge before_dom_children (basic_block) final override;
262 : :
263 : : private:
264 : : /* The parent pass. */
265 : : loop_versioning &m_lv;
266 : : };
267 : :
268 : : /* Used to simplify statements based on conditions that are established
269 : : by the version checks. */
270 : 2214 : class name_prop : public substitute_and_fold_engine
271 : : {
272 : : public:
273 : 2214 : name_prop (loop_info &li) : m_li (li) {}
274 : : tree value_of_expr (tree name, gimple *) final override;
275 : :
276 : : private:
277 : : /* Information about the versioning we've performed on the loop. */
278 : : loop_info &m_li;
279 : : };
280 : :
281 : 2014554 : loop_info &get_loop_info (class loop *loop) { return m_loops[loop->num]; }
282 : :
283 : : unsigned int max_insns_for_loop (class loop *);
284 : : bool expensive_stmt_p (gimple *);
285 : :
286 : : void version_for_unity (gimple *, tree);
287 : : bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT,
288 : : unsigned HOST_WIDE_INT * = 0);
289 : : bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *);
290 : : bool multiply_term_by (address_term_info &, tree);
291 : : inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT);
292 : : void dump_inner_likelihood (address_info &, address_term_info &);
293 : : void analyze_stride (address_info &, address_term_info &,
294 : : tree, class loop *);
295 : : bool find_per_loop_multiplication (address_info &, address_term_info &);
296 : : bool analyze_term_using_scevs (address_info &, address_term_info &);
297 : : void analyze_arbitrary_term (address_info &, address_term_info &);
298 : : void analyze_address_fragment (address_info &);
299 : : void record_address_fragment (gimple *, unsigned HOST_WIDE_INT,
300 : : tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT);
301 : : void analyze_expr (gimple *, tree);
302 : : bool analyze_block (basic_block);
303 : : bool analyze_blocks ();
304 : :
305 : : void prune_loop_conditions (class loop *);
306 : : bool prune_conditions ();
307 : :
308 : : void merge_loop_info (class loop *, class loop *);
309 : : void add_loop_to_queue (class loop *);
310 : : bool decide_whether_loop_is_versionable (class loop *);
311 : : bool make_versioning_decisions ();
312 : :
313 : : bool version_loop (class loop *);
314 : : void implement_versioning_decisions ();
315 : :
316 : : /* The function we're optimizing. */
317 : : function *m_fn;
318 : :
319 : : /* The obstack to use for all pass-specific bitmaps. */
320 : : bitmap_obstack m_bitmap_obstack;
321 : :
322 : : /* An obstack to use for general allocation. */
323 : : obstack m_obstack;
324 : :
325 : : /* The total number of loop version conditions we've found. */
326 : : unsigned int m_num_conditions;
327 : :
328 : : /* Assume that an address fragment of the form i * stride * scale
329 : : (for variable stride and constant scale) will not benefit from
330 : : versioning for stride == 1 when scale is greater than this value. */
331 : : unsigned HOST_WIDE_INT m_maximum_scale;
332 : :
333 : : /* Information about each loop. */
334 : : auto_vec<loop_info> m_loops;
335 : :
336 : : /* Used to form a linked list of blocks that belong to a loop,
337 : : started by loop_info::block_list. */
338 : : auto_vec<basic_block> m_next_block_in_loop;
339 : :
340 : : /* The list of loops that we've decided to version. */
341 : : auto_vec<class loop *> m_loops_to_version;
342 : :
343 : : /* A table of addresses in the current loop, keyed off their values
344 : : but not their offsets. */
345 : : hash_table <address_info_hasher> m_address_table;
346 : :
347 : : /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order. */
348 : : auto_vec <address_info *, 32> m_address_list;
349 : : };
350 : :
351 : : /* If EXPR is an SSA name and not a default definition, return the
352 : : defining statement, otherwise return null. */
353 : :
354 : : static gimple *
355 : 1107017 : maybe_get_stmt (tree expr)
356 : : {
357 : 952357 : if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr))
358 : 0 : return SSA_NAME_DEF_STMT (expr);
359 : : return NULL;
360 : : }
361 : :
362 : : /* Like maybe_get_stmt, but also return null if the defining
363 : : statement isn't an assignment. */
364 : :
365 : : static gassign *
366 : 986922 : maybe_get_assign (tree expr)
367 : : {
368 : 2097458 : return safe_dyn_cast <gassign *> (maybe_get_stmt (expr));
369 : : }
370 : :
371 : : /* Return true if this pass should look through a cast of expression FROM
372 : : to type TYPE when analyzing pieces of an address. */
373 : :
374 : : static bool
375 : 53321 : look_through_cast_p (tree type, tree from)
376 : : {
377 : 53321 : return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type)
378 : 53321 : && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type));
379 : : }
380 : :
381 : : /* Strip all conversions of integers or pointers from EXPR, regardless
382 : : of whether the conversions are nops. This is useful in the context
383 : : of this pass because we're not trying to fold or simulate the
384 : : expression; we just want to see how it's structured. */
385 : :
386 : : static tree
387 : 391222 : strip_casts (tree expr)
388 : : {
389 : 391222 : const unsigned int MAX_NITERS = 4;
390 : :
391 : 391222 : tree type = TREE_TYPE (expr);
392 : 391222 : while (CONVERT_EXPR_P (expr)
393 : 391414 : && look_through_cast_p (type, TREE_OPERAND (expr, 0)))
394 : 192 : expr = TREE_OPERAND (expr, 0);
395 : :
396 : 443533 : for (unsigned int niters = 0; niters < MAX_NITERS; ++niters)
397 : : {
398 : 443533 : gassign *assign = maybe_get_assign (expr);
399 : 443533 : if (assign
400 : 214921 : && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))
401 : 496662 : && look_through_cast_p (type, gimple_assign_rhs1 (assign)))
402 : 52311 : expr = gimple_assign_rhs1 (assign);
403 : : else
404 : : break;
405 : : }
406 : 391222 : return expr;
407 : : }
408 : :
409 : : /* Compare two address_term_infos in the same address_info. */
410 : :
411 : : static int
412 : 276842 : compare_address_terms (const void *a_uncast, const void *b_uncast)
413 : : {
414 : 276842 : const address_term_info *a = (const address_term_info *) a_uncast;
415 : 276842 : const address_term_info *b = (const address_term_info *) b_uncast;
416 : :
417 : 276842 : if (a->expr != b->expr)
418 : 276498 : return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1;
419 : :
420 : 344 : if (a->multiplier != b->multiplier)
421 : 266 : return a->multiplier < b->multiplier ? -1 : 1;
422 : :
423 : : return 0;
424 : : }
425 : :
426 : : /* Dump ADDRESS using flags FLAGS. */
427 : :
428 : : static void
429 : 420 : dump_address_info (dump_flags_t flags, address_info &address)
430 : : {
431 : 420 : if (address.base)
432 : 2 : dump_printf (flags, "%T + ", address.base);
433 : 1170 : for (unsigned int i = 0; i < address.terms.length (); ++i)
434 : : {
435 : 750 : if (i != 0)
436 : 330 : dump_printf (flags, " + ");
437 : 750 : dump_printf (flags, "%T", address.terms[i].expr);
438 : 750 : if (address.terms[i].multiplier != 1)
439 : 522 : dump_printf (flags, " * %wd", address.terms[i].multiplier);
440 : : }
441 : 840 : dump_printf (flags, " + [%wd, %wd]",
442 : 420 : address.min_offset, address.max_offset - 1);
443 : 420 : }
444 : :
445 : : /* Hash an address_info based on its base and terms. */
446 : :
447 : : hashval_t
448 : 184175 : address_info_hasher::hash (const address_info *info)
449 : : {
450 : 184175 : inchash::hash hash;
451 : 184175 : hash.add_int (info->base ? TREE_CODE (info->base) : 0);
452 : 368350 : hash.add_int (info->terms.length ());
453 : 453566 : for (unsigned int i = 0; i < info->terms.length (); ++i)
454 : : {
455 : 269391 : hash.add_int (SSA_NAME_VERSION (info->terms[i].expr));
456 : 269391 : hash.add_hwi (info->terms[i].multiplier);
457 : : }
458 : 184175 : return hash.end ();
459 : : }
460 : :
461 : : /* Return true if two address_infos have equal bases and terms. Other
462 : : properties might be different (such as the statement or constant
463 : : offset range). */
464 : :
465 : : bool
466 : 100462 : address_info_hasher::equal (const address_info *a, const address_info *b)
467 : : {
468 : 100462 : if (a->base != b->base
469 : 100462 : && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0)))
470 : 2739 : return false;
471 : :
472 : 293169 : if (a->terms.length () != b->terms.length ())
473 : : return false;
474 : :
475 : 192996 : for (unsigned int i = 0; i < a->terms.length (); ++i)
476 : 122729 : if (a->terms[i].expr != b->terms[i].expr
477 : 122729 : || a->terms[i].multiplier != b->terms[i].multiplier)
478 : : return false;
479 : :
480 : : return true;
481 : : }
482 : :
483 : : /* Return true if we want to version the loop, i.e. if we have a
484 : : specific reason for doing so and no specific reason not to. */
485 : :
486 : : bool
487 : 7072 : loop_info::worth_versioning_p () const
488 : : {
489 : 7072 : return (!rejected_p
490 : 5358 : && (!bitmap_empty_p (&unity_names) || subloops_benefit_p));
491 : : }
492 : :
493 : 658 : loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
494 : 0 : : dom_walker (CDI_DOMINATORS), m_lv (lv)
495 : : {
496 : 0 : }
497 : :
498 : : /* Process BB before processing the blocks it dominates. */
499 : :
500 : : edge
501 : 28973 : loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
502 : : {
503 : 28973 : if (bb == bb->loop_father->header)
504 : 4091 : m_lv.prune_loop_conditions (bb->loop_father);
505 : :
506 : 28973 : return NULL;
507 : : }
508 : :
509 : : /* Decide whether to replace VAL with a new value in a versioned loop.
510 : : Return the new value if so, otherwise return null. */
511 : :
512 : : tree
513 : 65705 : loop_versioning::name_prop::value_of_expr (tree val, gimple *)
514 : : {
515 : 65705 : if (TREE_CODE (val) == SSA_NAME
516 : 65705 : && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val)))
517 : 1342 : return build_one_cst (TREE_TYPE (val));
518 : : return NULL_TREE;
519 : : }
520 : :
521 : : /* Initialize the structure to optimize FN. */
522 : :
523 : 25178 : loop_versioning::loop_versioning (function *fn)
524 : 25178 : : m_fn (fn),
525 : 25178 : m_num_conditions (0),
526 : 25178 : m_address_table (31)
527 : : {
528 : 25178 : unsigned m_nloops = number_of_loops (fn);
529 : 25178 : bitmap_obstack_initialize (&m_bitmap_obstack);
530 : 25178 : gcc_obstack_init (&m_obstack);
531 : :
532 : : /* Initialize the loop information. */
533 : 25178 : m_loops.safe_grow_cleared (m_nloops, true);
534 : 159916 : for (unsigned int i = 0; i < m_nloops; ++i)
535 : : {
536 : 134738 : m_loops[i].outermost = get_loop (m_fn, 0);
537 : 134738 : bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack);
538 : : }
539 : :
540 : : /* Initialize the list of blocks that belong to each loop. */
541 : 25178 : unsigned int nbbs = last_basic_block_for_fn (fn);
542 : 25178 : m_next_block_in_loop.safe_grow (nbbs, true);
543 : 25178 : basic_block bb;
544 : 655835 : FOR_EACH_BB_FN (bb, fn)
545 : : {
546 : 630657 : loop_info &li = get_loop_info (bb->loop_father);
547 : 630657 : m_next_block_in_loop[bb->index] = li.block_list;
548 : 630657 : li.block_list = bb;
549 : : }
550 : :
551 : : /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for
552 : : unvectorizable code, since it is the largest size that can be
553 : : handled efficiently by scalar code. omp_max_vf calculates the
554 : : maximum number of bytes in a vector, when such a value is relevant
555 : : to loop optimization. */
556 : 25178 : m_maximum_scale = estimated_poly_value (omp_max_vf (false));
557 : 75534 : m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
558 : 25178 : }
559 : :
560 : 25178 : loop_versioning::~loop_versioning ()
561 : : {
562 : 25178 : bitmap_obstack_release (&m_bitmap_obstack);
563 : 25178 : obstack_free (&m_obstack, NULL);
564 : 25178 : }
565 : :
566 : : /* Return the maximum number of instructions allowed in LOOP before
567 : : it becomes too big for versioning.
568 : :
569 : : There are separate limits for inner and outer loops. The limit for
570 : : inner loops applies only to loops that benefit directly from versioning.
571 : : The limit for outer loops applies to all code in the outer loop and
572 : : its subloops that *doesn't* benefit directly from versioning; such code
573 : : would be "taken along for the ride". The idea is that if the cost of
574 : : the latter is small, it is better to version outer loops rather than
575 : : inner loops, both to reduce the number of repeated checks and to enable
576 : : more of the loop nest to be optimized as a natural nest (e.g. by loop
577 : : interchange or outer-loop vectorization). */
578 : :
579 : : unsigned int
580 : 1490 : loop_versioning::max_insns_for_loop (class loop *loop)
581 : : {
582 : 1490 : return (loop->inner
583 : 377 : ? param_loop_versioning_max_outer_insns
584 : 1113 : : param_loop_versioning_max_inner_insns);
585 : : }
586 : :
587 : : /* Return true if for cost reasons we should avoid versioning any loop
588 : : that contains STMT.
589 : :
590 : : Note that we don't need to check whether versioning is invalid for
591 : : correctness reasons, since the versioning process does that for us.
592 : : The conditions involved are too rare to be worth duplicating here. */
593 : :
594 : : bool
595 : 724527 : loop_versioning::expensive_stmt_p (gimple *stmt)
596 : : {
597 : 0 : if (gcall *call = dyn_cast <gcall *> (stmt))
598 : : /* Assume for now that the time spent in an "expensive" call would
599 : : overwhelm any saving from versioning. */
600 : 21596 : return !gimple_inexpensive_call_p (call);
601 : : return false;
602 : : }
603 : :
604 : : /* Record that we want to version the loop that contains STMT for the
605 : : case in which SSA name NAME is equal to 1. We already know that NAME
606 : : is invariant in the loop. */
607 : :
608 : : void
609 : 2042 : loop_versioning::version_for_unity (gimple *stmt, tree name)
610 : : {
611 : 2042 : class loop *loop = loop_containing_stmt (stmt);
612 : 2042 : loop_info &li = get_loop_info (loop);
613 : :
614 : 2042 : if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name)))
615 : : {
616 : : /* This is the first time we've wanted to version LOOP for NAME.
617 : : Keep track of the outermost loop that can handle all versioning
618 : : checks in LI. */
619 : 1384 : class loop *outermost
620 : 1384 : = outermost_invariant_loop_for_expr (loop, name);
621 : 1623 : if (loop_depth (li.outermost) < loop_depth (outermost))
622 : 1086 : li.outermost = outermost;
623 : :
624 : 1384 : if (dump_enabled_p ())
625 : : {
626 : 70 : dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop"
627 : : " for when %T == 1", name);
628 : 70 : if (outermost == loop)
629 : 23 : dump_printf (MSG_NOTE, "; cannot hoist check further");
630 : : else
631 : : {
632 : 57 : dump_printf (MSG_NOTE, "; could implement the check at loop"
633 : : " depth %d", loop_depth (outermost));
634 : 67 : if (loop_depth (li.outermost) > loop_depth (outermost))
635 : 0 : dump_printf (MSG_NOTE, ", but other checks only allow"
636 : 0 : " a depth of %d", loop_depth (li.outermost));
637 : : }
638 : 70 : dump_printf (MSG_NOTE, "\n");
639 : : }
640 : :
641 : 1384 : m_num_conditions += 1;
642 : : }
643 : : else
644 : : {
645 : : /* This is a duplicate request. */
646 : 658 : if (dump_enabled_p ())
647 : 6 : dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing"
648 : : " loop for when %T == 1\n", name);
649 : : }
650 : 2042 : }
651 : :
652 : : /* Return true if OP1_TREE is constant and if in principle it is worth
653 : : versioning an address fragment of the form:
654 : :
655 : : i * OP1_TREE * OP2 * stride
656 : :
657 : : for the case in which stride == 1. This in practice means testing
658 : : whether:
659 : :
660 : : OP1_TREE * OP2 <= M_MAXIMUM_SCALE.
661 : :
662 : : If RESULT is nonnull, store OP1_TREE * OP2 there when returning true. */
663 : :
664 : : bool
665 : 301759 : loop_versioning::acceptable_multiplier_p (tree op1_tree,
666 : : unsigned HOST_WIDE_INT op2,
667 : : unsigned HOST_WIDE_INT *result)
668 : : {
669 : 301759 : if (tree_fits_uhwi_p (op1_tree))
670 : : {
671 : 285286 : unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree);
672 : : /* The first part checks for overflow. */
673 : 285286 : if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale)
674 : : {
675 : 262587 : if (result)
676 : 216095 : *result = op1 * op2;
677 : 262587 : return true;
678 : : }
679 : : }
680 : : return false;
681 : : }
682 : :
683 : : /* Return true if it is worth using loop versioning on a memory access
684 : : of type TYPE. Store the size of the access in *SIZE if so. */
685 : :
686 : : bool
687 : 193484 : loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size)
688 : : {
689 : 193484 : return (TYPE_SIZE_UNIT (type)
690 : 193484 : && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size));
691 : : }
692 : :
693 : : /* See whether OP is constant and whether we can multiply TERM by that
694 : : constant without exceeding M_MAXIMUM_SCALE. Return true and update
695 : : TERM if so. */
696 : :
697 : : bool
698 : 78515 : loop_versioning::multiply_term_by (address_term_info &term, tree op)
699 : : {
700 : 0 : return acceptable_multiplier_p (op, term.multiplier, &term.multiplier);
701 : : }
702 : :
703 : : /* Decide whether an address fragment of the form STRIDE * MULTIPLIER
704 : : is likely to be indexing an innermost dimension, returning the result
705 : : as an INNER_* probability. */
706 : :
707 : : inner_likelihood
708 : 69666 : loop_versioning::get_inner_likelihood (tree stride,
709 : : unsigned HOST_WIDE_INT multiplier)
710 : : {
711 : 69666 : const unsigned int MAX_NITERS = 8;
712 : :
713 : : /* Iterate over possible values of STRIDE. Return INNER_LIKELY if at
714 : : least one of those values is likely to be for the innermost dimension.
715 : : Record in UNLIKELY_P if at least one of those values is unlikely to be
716 : : for the innermost dimension.
717 : :
718 : : E.g. for:
719 : :
720 : : stride = cond ? a * b : 1
721 : :
722 : : we should treat STRIDE as being a likely inner dimension, since
723 : : we know that it is 1 under at least some circumstances. (See the
724 : : Fortran example below.) However:
725 : :
726 : : stride = a * b
727 : :
728 : : on its own is unlikely to be for the innermost dimension, since
729 : : that would require both a and b to be 1 at runtime. */
730 : 69666 : bool unlikely_p = false;
731 : 69666 : tree worklist[MAX_NITERS];
732 : 69666 : unsigned int length = 0;
733 : 69666 : worklist[length++] = stride;
734 : 107938 : for (unsigned int i = 0; i < length; ++i)
735 : : {
736 : 84764 : tree expr = worklist[i];
737 : :
738 : 84764 : if (CONSTANT_CLASS_P (expr))
739 : : {
740 : : /* See if EXPR * MULTIPLIER would be consistent with an individual
741 : : access or a small grouped access. */
742 : 48990 : if (acceptable_multiplier_p (expr, multiplier))
743 : : return INNER_LIKELY;
744 : : else
745 : : unlikely_p = true;
746 : : }
747 : 70358 : else if (gimple *stmt = maybe_get_stmt (expr))
748 : : {
749 : : /* If EXPR is set by a PHI node, queue its arguments in case
750 : : we find one that is consistent with an inner dimension.
751 : :
752 : : An important instance of this is the Fortran handling of array
753 : : descriptors, which calculates the stride of the inner dimension
754 : : using a PHI equivalent of:
755 : :
756 : : raw_stride = a.dim[0].stride;
757 : : stride = raw_stride != 0 ? raw_stride : 1;
758 : :
759 : : (Strides for outer dimensions do not treat 0 specially.) */
760 : 34584 : if (gphi *phi = dyn_cast <gphi *> (stmt))
761 : : {
762 : 9622 : unsigned int nargs = gimple_phi_num_args (phi);
763 : 25114 : for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j)
764 : 15492 : worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j));
765 : : }
766 : : /* If the value is set by an assignment, expect it to be read
767 : : from memory (such as an array descriptor) rather than be
768 : : calculated. */
769 : 63234 : else if (gassign *assign = dyn_cast <gassign *> (stmt))
770 : : {
771 : 24646 : if (!gimple_assign_load_p (assign))
772 : 12310 : unlikely_p = true;
773 : : }
774 : : /* Things like calls don't really tell us anything. */
775 : : }
776 : : }
777 : :
778 : : /* We didn't find any possible values of STRIDE that were likely to be
779 : : for the innermost dimension. If we found one that was actively
780 : : unlikely to be for the innermost dimension, assume that that applies
781 : : to STRIDE too. */
782 : 23174 : return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW;
783 : : }
784 : :
785 : : /* Dump the likelihood that TERM's stride is for the innermost dimension.
786 : : ADDRESS is the address that contains TERM. */
787 : :
788 : : void
789 : 237 : loop_versioning::dump_inner_likelihood (address_info &address,
790 : : address_term_info &term)
791 : : {
792 : 237 : if (term.inner_likelihood == INNER_LIKELY)
793 : 82 : dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the"
794 : : " innermost dimension\n", term.stride);
795 : 155 : else if (term.inner_likelihood == INNER_UNLIKELY)
796 : 37 : dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the"
797 : : " innermost dimension\n", term.stride);
798 : : else
799 : 118 : dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T"
800 : : " is the innermost dimension\n", term.stride);
801 : 237 : }
802 : :
803 : : /* The caller has identified that STRIDE is the stride of interest
804 : : in TERM, and that the stride is applied in OP_LOOP. Record this
805 : : information in TERM, deciding whether STRIDE is likely to be for
806 : : the innermost dimension of an array and whether it represents a
807 : : versioning opportunity. ADDRESS is the address that contains TERM. */
808 : :
809 : : void
810 : 52886 : loop_versioning::analyze_stride (address_info &address,
811 : : address_term_info &term,
812 : : tree stride, class loop *op_loop)
813 : : {
814 : 52886 : term.stride = stride;
815 : :
816 : 52886 : term.inner_likelihood = get_inner_likelihood (stride, term.multiplier);
817 : 52886 : if (dump_enabled_p ())
818 : 197 : dump_inner_likelihood (address, term);
819 : :
820 : : /* To be a versioning opportunity we require:
821 : :
822 : : - The multiplier applied by TERM is equal to the access size,
823 : : so that when STRIDE is 1, the accesses in successive loop
824 : : iterations are consecutive.
825 : :
826 : : This is deliberately conservative. We could relax it to handle
827 : : other cases (such as those with gaps between iterations) if we
828 : : find any real testcases for which it's useful.
829 : :
830 : : - the stride is applied in the same loop as STMT rather than
831 : : in an outer loop. Although versioning for strides applied in
832 : : outer loops could help in some cases -- such as enabling
833 : : more loop interchange -- the savings are much lower than for
834 : : inner loops.
835 : :
836 : : - the stride is an SSA name that is invariant in STMT's loop,
837 : : since otherwise versioning isn't possible. */
838 : 52886 : unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset;
839 : 52886 : if (term.multiplier == access_size
840 : 43902 : && address.loop == op_loop
841 : 41499 : && TREE_CODE (stride) == SSA_NAME
842 : 55234 : && expr_invariant_in_loop_p (address.loop, stride))
843 : : {
844 : 2337 : term.versioning_opportunity_p = true;
845 : 2337 : if (dump_enabled_p ())
846 : 88 : dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning"
847 : : " opportunity\n", stride);
848 : : }
849 : 52886 : }
850 : :
851 : : /* See whether address term TERM (which belongs to ADDRESS) is the result
852 : : of multiplying a varying SSA name by a loop-invariant SSA name.
853 : : Return true and update TERM if so.
854 : :
855 : : This handles both cases that SCEV might handle, such as:
856 : :
857 : : for (int i = 0; i < n; ++i)
858 : : res += a[i * stride];
859 : :
860 : : and ones in which the term varies arbitrarily between iterations, such as:
861 : :
862 : : for (int i = 0; i < n; ++i)
863 : : res += a[index[i] * stride]; */
864 : :
865 : : bool
866 : 89964 : loop_versioning::find_per_loop_multiplication (address_info &address,
867 : : address_term_info &term)
868 : : {
869 : 89964 : gassign *mult = maybe_get_assign (term.expr);
870 : 113218 : if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR)
871 : : return false;
872 : :
873 : 6882 : class loop *mult_loop = loop_containing_stmt (mult);
874 : 6882 : if (!loop_outer (mult_loop))
875 : : return false;
876 : :
877 : 6616 : tree op1 = strip_casts (gimple_assign_rhs1 (mult));
878 : 13232 : tree op2 = strip_casts (gimple_assign_rhs2 (mult));
879 : 6616 : if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
880 : : return false;
881 : :
882 : 5864 : bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1);
883 : 5864 : bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2);
884 : 5864 : if (invariant1_p == invariant2_p)
885 : : return false;
886 : :
887 : : /* Make sure that the loop invariant is OP2 rather than OP1. */
888 : 5643 : if (invariant1_p)
889 : 3370 : std::swap (op1, op2);
890 : :
891 : 5643 : if (dump_enabled_p ())
892 : 90 : dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T"
893 : : " * loop-invariant %T\n", term.expr, op1, op2);
894 : 5643 : analyze_stride (address, term, op2, mult_loop);
895 : 5643 : return true;
896 : : }
897 : :
898 : : /* Try to use scalar evolutions to find an address stride for TERM,
899 : : which belongs to ADDRESS. Return true and update TERM if so.
900 : :
901 : : Here we are interested in any evolution information we can find,
902 : : not just evolutions wrt ADDRESS->LOOP. For example, if we find that
903 : : an outer loop obviously iterates over the inner dimension of an array,
904 : : that information can help us eliminate worthless versioning opportunities
905 : : in inner loops. */
906 : :
907 : : bool
908 : 84321 : loop_versioning::analyze_term_using_scevs (address_info &address,
909 : : address_term_info &term)
910 : : {
911 : 121399 : gimple *setter = maybe_get_stmt (term.expr);
912 : 70656 : if (!setter)
913 : : return false;
914 : :
915 : 70656 : class loop *wrt_loop = loop_containing_stmt (setter);
916 : 70656 : if (!loop_outer (wrt_loop))
917 : : return false;
918 : :
919 : 57514 : tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr));
920 : 57514 : if (TREE_CODE (chrec) == POLYNOMIAL_CHREC)
921 : : {
922 : 47243 : if (dump_enabled_p ())
923 : 107 : dump_printf_loc (MSG_NOTE, address.stmt,
924 : : "address term %T = %T\n", term.expr, chrec);
925 : :
926 : : /* Peel casts and accumulate constant multiplications, up to the
927 : : limit allowed by M_MAXIMUM_SCALE. */
928 : 47243 : tree stride = strip_casts (CHREC_RIGHT (chrec));
929 : 47243 : while (TREE_CODE (stride) == MULT_EXPR
930 : 47243 : && multiply_term_by (term, TREE_OPERAND (stride, 1)))
931 : 0 : stride = strip_casts (TREE_OPERAND (stride, 0));
932 : :
933 : : gassign *assign;
934 : 47364 : while ((assign = maybe_get_assign (stride))
935 : 355 : && gimple_assign_rhs_code (assign) == MULT_EXPR
936 : 47485 : && multiply_term_by (term, gimple_assign_rhs2 (assign)))
937 : : {
938 : 121 : if (dump_enabled_p ())
939 : 24 : dump_printf_loc (MSG_NOTE, address.stmt,
940 : : "looking through %G", (gimple *) assign);
941 : 121 : stride = strip_casts (gimple_assign_rhs1 (assign));
942 : : }
943 : :
944 : 47243 : analyze_stride (address, term, stride, get_chrec_loop (chrec));
945 : 47243 : return true;
946 : : }
947 : :
948 : : return false;
949 : : }
950 : :
951 : : /* Address term TERM is an arbitrary term that provides no versioning
952 : : opportunities. Analyze it to see whether it contains any likely
953 : : inner strides, so that we don't mistakenly version for other
954 : : (less likely) candidates.
955 : :
956 : : This copes with invariant innermost indices such as:
957 : :
958 : : x(i, :) = 100
959 : :
960 : : where the "i" component of the address is invariant in the loop
961 : : but provides the real inner stride.
962 : :
963 : : ADDRESS is the address that contains TERM. */
964 : :
965 : : void
966 : 16200 : loop_versioning::analyze_arbitrary_term (address_info &address,
967 : : address_term_info &term)
968 : :
969 : : {
970 : : /* A multiplication offers two potential strides. Pick the one that
971 : : is most likely to be an innermost stride. */
972 : 16200 : tree expr = term.expr, alt = NULL_TREE;
973 : 16200 : gassign *mult = maybe_get_assign (expr);
974 : 28920 : if (mult && gimple_assign_rhs_code (mult) == MULT_EXPR)
975 : : {
976 : 580 : expr = strip_casts (gimple_assign_rhs1 (mult));
977 : 1160 : alt = strip_casts (gimple_assign_rhs2 (mult));
978 : : }
979 : 16200 : term.stride = expr;
980 : 16200 : term.inner_likelihood = get_inner_likelihood (expr, term.multiplier);
981 : 16200 : if (alt)
982 : : {
983 : 580 : inner_likelihood alt_l = get_inner_likelihood (alt, term.multiplier);
984 : 580 : if (alt_l > term.inner_likelihood)
985 : : {
986 : 219 : term.stride = alt;
987 : 219 : term.inner_likelihood = alt_l;
988 : : }
989 : : }
990 : 16200 : if (dump_enabled_p ())
991 : 40 : dump_inner_likelihood (address, term);
992 : 16200 : }
993 : :
994 : : /* Try to identify loop strides in ADDRESS and try to choose realistic
995 : : versioning opportunities based on these strides.
996 : :
997 : : The main difficulty here isn't finding strides that could be used
998 : : in a version check (that's pretty easy). The problem instead is to
999 : : avoid versioning for some stride S that is unlikely ever to be 1 at
1000 : : runtime. Versioning for S == 1 on its own would lead to unnecessary
1001 : : code bloat, while adding S == 1 to more realistic version conditions
1002 : : would lose the optimisation opportunity offered by those other conditions.
1003 : :
1004 : : For example, versioning for a stride of 1 in the Fortran code:
1005 : :
1006 : : integer :: a(:,:)
1007 : : a(1,:) = 1
1008 : :
1009 : : is not usually a good idea, since the assignment is iterating over
1010 : : an outer dimension and is relatively unlikely to have a stride of 1.
1011 : : (It isn't impossible, since the inner dimension might be 1, or the
1012 : : array might be transposed.) Similarly, in:
1013 : :
1014 : : integer :: a(:,:), b(:,:)
1015 : : b(:,1) = a(1,:)
1016 : :
1017 : : b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't.
1018 : : Versioning for when both strides are 1 would lose most of the benefit
1019 : : of versioning for b's access.
1020 : :
1021 : : The approach we take is as follows:
1022 : :
1023 : : - Analyze each term to see whether it has an identifiable stride,
1024 : : regardless of which loop applies the stride.
1025 : :
1026 : : - Evaluate the likelihood that each such stride is for the innermost
1027 : : dimension of an array, on the scale "likely", "don't know" or "unlikely".
1028 : :
1029 : : - If there is a single "likely" innermost stride, and that stride is
1030 : : applied in the loop that contains STMT, version the loop for when the
1031 : : stride is 1. This deals with the cases in which we're fairly
1032 : : confident of doing the right thing, such as the b(:,1) reference above.
1033 : :
1034 : : - If there are no "likely" innermost strides, and the loop that contains
1035 : : STMT uses a stride that we rated as "don't know", version for when
1036 : : that stride is 1. This is principally used for C code such as:
1037 : :
1038 : : for (int i = 0; i < n; ++i)
1039 : : a[i * x] = ...;
1040 : :
1041 : : and:
1042 : :
1043 : : for (int j = 0; j < n; ++j)
1044 : : for (int i = 0; i < n; ++i)
1045 : : a[i * x + j * y] = ...;
1046 : :
1047 : : where nothing in the way "x" and "y" are set gives a hint as to
1048 : : whether "i" iterates over the innermost dimension of the array.
1049 : : In these situations it seems reasonable to assume the
1050 : : programmer has nested the loops appropriately (although of course
1051 : : there are examples like GEMM in which this assumption doesn't hold
1052 : : for all accesses in the loop).
1053 : :
1054 : : This case is also useful for the Fortran equivalent of the
1055 : : above C code. */
1056 : :
1057 : : void
1058 : 57872 : loop_versioning::analyze_address_fragment (address_info &address)
1059 : : {
1060 : 57872 : if (dump_enabled_p ())
1061 : : {
1062 : 171 : dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment ");
1063 : 171 : dump_address_info (MSG_NOTE, address);
1064 : 171 : dump_printf (MSG_NOTE, "\n");
1065 : : }
1066 : :
1067 : : /* Analyze each component of the sum to see whether it involves an
1068 : : apparent stride.
1069 : :
1070 : : There is an overlap between the addresses that
1071 : : find_per_loop_multiplication and analyze_term_using_scevs can handle,
1072 : : but the former is much cheaper than SCEV analysis, so try it first. */
1073 : 295672 : for (unsigned int i = 0; i < address.terms.length (); ++i)
1074 : 89964 : if (!find_per_loop_multiplication (address, address.terms[i])
1075 : 84321 : && !analyze_term_using_scevs (address, address.terms[i])
1076 : 127042 : && !POINTER_TYPE_P (TREE_TYPE (address.terms[i].expr)))
1077 : 16200 : analyze_arbitrary_term (address, address.terms[i]);
1078 : :
1079 : : /* Check for strides that are likely to be for the innermost dimension.
1080 : :
1081 : : 1. If there is a single likely inner stride, if it is an SSA name,
1082 : : and if it is worth versioning the loop for when the SSA name
1083 : : equals 1, record that we want to do so.
1084 : :
1085 : : 2. Otherwise, if there any likely inner strides, bail out. This means
1086 : : one of:
1087 : :
1088 : : (a) There are multiple likely inner strides. This suggests we're
1089 : : confused and be can't be confident of doing the right thing.
1090 : :
1091 : : (b) There is a single likely inner stride and it is a constant
1092 : : rather than an SSA name. This can mean either that the access
1093 : : is a natural one without any variable strides, such as:
1094 : :
1095 : : for (int i = 0; i < n; ++i)
1096 : : a[i] += 1;
1097 : :
1098 : : or that a variable stride is applied to an outer dimension,
1099 : : such as:
1100 : :
1101 : : for (int i = 0; i < n; ++i)
1102 : : for (int j = 0; j < n; ++j)
1103 : : a[j * stride][i] += 1;
1104 : :
1105 : : (c) There is a single likely inner stride, and it is an SSA name,
1106 : : but it isn't a worthwhile versioning opportunity. This usually
1107 : : means that the variable stride is applied by an outer loop,
1108 : : such as:
1109 : :
1110 : : for (int i = 0; i < n; ++i)
1111 : : for (int j = 0; j < n; ++j)
1112 : : a[j][i * stride] += 1;
1113 : :
1114 : : or (using an example with a more natural loop nesting):
1115 : :
1116 : : for (int i = 0; i < n; ++i)
1117 : : for (int j = 0; j < n; ++j)
1118 : : a[i][j] += b[i * stride];
1119 : :
1120 : : in cases where b[i * stride] cannot (yet) be hoisted for
1121 : : aliasing reasons.
1122 : :
1123 : : 3. If there are no likely inner strides, fall through to the next
1124 : : set of checks.
1125 : :
1126 : : Pointer equality is enough to check for uniqueness in (1), since we
1127 : : only care about SSA names. */
1128 : : tree chosen_stride = NULL_TREE;
1129 : : tree version_stride = NULL_TREE;
1130 : 147612 : for (unsigned int i = 0; i < address.terms.length (); ++i)
1131 : 89886 : if (chosen_stride != address.terms[i].stride
1132 : 89886 : && address.terms[i].inner_likelihood == INNER_LIKELY)
1133 : : {
1134 : 44938 : if (chosen_stride)
1135 : : return;
1136 : 44792 : chosen_stride = address.terms[i].stride;
1137 : 44792 : if (address.terms[i].versioning_opportunity_p)
1138 : 663 : version_stride = chosen_stride;
1139 : : }
1140 : :
1141 : : /* If there are no likely inner strides, see if there is a single
1142 : : versioning opportunity for a stride that was rated as INNER_DONT_KNOW.
1143 : : See the comment above the function for the cases that this code
1144 : : handles. */
1145 : 57726 : if (!chosen_stride)
1146 : 38873 : for (unsigned int i = 0; i < address.terms.length (); ++i)
1147 : 25802 : if (version_stride != address.terms[i].stride
1148 : 15832 : && address.terms[i].inner_likelihood == INNER_DONT_KNOW
1149 : 35218 : && address.terms[i].versioning_opportunity_p)
1150 : : {
1151 : 1403 : if (version_stride)
1152 : : return;
1153 : : version_stride = address.terms[i].stride;
1154 : : }
1155 : :
1156 : 57717 : if (version_stride)
1157 : 2042 : version_for_unity (address.stmt, version_stride);
1158 : : }
1159 : :
1160 : : /* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses
1161 : : TYPE_SIZE bytes and record this address fragment for later processing.
1162 : : STMT is the statement that contains the address. */
1163 : :
1164 : : void
1165 : 157165 : loop_versioning::record_address_fragment (gimple *stmt,
1166 : : unsigned HOST_WIDE_INT type_size,
1167 : : tree expr,
1168 : : unsigned HOST_WIDE_INT multiplier,
1169 : : HOST_WIDE_INT offset)
1170 : : {
1171 : : /* We're only interested in computed values. */
1172 : 157165 : if (TREE_CODE (expr) != SSA_NAME)
1173 : 24048 : return;
1174 : :
1175 : : /* Quick exit if no part of the address is calculated in STMT's loop,
1176 : : since such addresses have no versioning opportunities. */
1177 : 141911 : class loop *loop = loop_containing_stmt (stmt);
1178 : 141911 : if (expr_invariant_in_loop_p (loop, expr))
1179 : : return;
1180 : :
1181 : : /* Set up an address_info for EXPR * MULTIPLIER. */
1182 : 133401 : address_info *address = XOBNEW (&m_obstack, address_info);
1183 : 133401 : new (address) address_info;
1184 : 133401 : address->stmt = stmt;
1185 : 133401 : address->loop = loop;
1186 : 133401 : address->base = NULL_TREE;
1187 : 133401 : address->terms.quick_grow (1);
1188 : 133401 : address->terms[0].expr = expr;
1189 : 133401 : address->terms[0].multiplier = multiplier;
1190 : 133401 : address->terms[0].stride = NULL_TREE;
1191 : 133401 : address->terms[0].inner_likelihood = INNER_UNLIKELY;
1192 : 133401 : address->terms[0].versioning_opportunity_p = false;
1193 : 133401 : address->min_offset = offset;
1194 : :
1195 : : /* Peel apart the expression into a sum of address_terms, where each
1196 : : term is multiplied by a constant. Treat a + b and a - b the same,
1197 : : since it doesn't matter for our purposes whether an address is
1198 : : increasing or decreasing. Distribute (a + b) * constant into
1199 : : a * constant + b * constant.
1200 : :
1201 : : We don't care which loop each term belongs to, since we want to
1202 : : examine as many candidate strides as possible when determining
1203 : : which is likely to be for the innermost dimension. We therefore
1204 : : don't limit the search to statements in STMT's loop. */
1205 : 523262 : for (unsigned int i = 0; i < address->terms.length (); )
1206 : : {
1207 : 389861 : if (gassign *assign = maybe_get_assign (address->terms[i].expr))
1208 : : {
1209 : 236179 : tree_code code = gimple_assign_rhs_code (assign);
1210 : 236179 : if (code == PLUS_EXPR
1211 : 236179 : || code == POINTER_PLUS_EXPR
1212 : 109825 : || code == MINUS_EXPR)
1213 : : {
1214 : 127939 : tree op1 = gimple_assign_rhs1 (assign);
1215 : 127939 : tree op2 = gimple_assign_rhs2 (assign);
1216 : 127939 : if (TREE_CODE (op2) == INTEGER_CST)
1217 : : {
1218 : 65590 : address->terms[i].expr = strip_casts (op1);
1219 : : /* This is heuristic only, so don't worry about truncation
1220 : : or overflow. */
1221 : 65590 : address->min_offset += (TREE_INT_CST_LOW (op2)
1222 : 65590 : * address->terms[i].multiplier);
1223 : 65590 : continue;
1224 : : }
1225 : 62349 : else if (address->terms.length () < address_info::MAX_TERMS)
1226 : : {
1227 : 62322 : unsigned int j = address->terms.length ();
1228 : 62322 : address->terms.quick_push (address->terms[i]);
1229 : 62322 : address->terms[i].expr = strip_casts (op1);
1230 : 62322 : address->terms[j].expr = strip_casts (op2);
1231 : 62322 : continue;
1232 : 62322 : }
1233 : : }
1234 : 108267 : if (code == MULT_EXPR)
1235 : : {
1236 : 78369 : tree op1 = gimple_assign_rhs1 (assign);
1237 : 78369 : tree op2 = gimple_assign_rhs2 (assign);
1238 : 78369 : if (multiply_term_by (address->terms[i], op2))
1239 : : {
1240 : 58809 : address->terms[i].expr = strip_casts (op1);
1241 : 58809 : continue;
1242 : : }
1243 : : }
1244 : 49458 : if (CONVERT_EXPR_CODE_P (code))
1245 : : {
1246 : 7417 : tree op1 = gimple_assign_rhs1 (assign);
1247 : 7417 : address->terms[i].expr = strip_casts (op1);
1248 : 7417 : continue;
1249 : 7417 : }
1250 : : }
1251 : 195723 : i += 1;
1252 : : }
1253 : :
1254 : : /* Peel off any symbolic pointer. */
1255 : 133401 : if (TREE_CODE (address->terms[0].expr) != SSA_NAME
1256 : 133401 : && address->terms[0].multiplier == 1)
1257 : : {
1258 : 5015 : if (address->terms.length () == 1)
1259 : : {
1260 : 0 : obstack_free (&m_obstack, address);
1261 : 0 : return;
1262 : : }
1263 : 5015 : address->base = address->terms[0].expr;
1264 : 5015 : address->terms.ordered_remove (0);
1265 : : }
1266 : :
1267 : : /* Require all remaining terms to be SSA names. (This could be false
1268 : : for unfolded statements, but they aren't worth dealing with.) */
1269 : 323363 : for (unsigned int i = 0; i < address->terms.length (); ++i)
1270 : 190246 : if (TREE_CODE (address->terms[i].expr) != SSA_NAME)
1271 : : {
1272 : 284 : obstack_free (&m_obstack, address);
1273 : 284 : return;
1274 : : }
1275 : :
1276 : : /* The loop above set MIN_OFFSET based on the first byte of the
1277 : : referenced data. Calculate the end + 1. */
1278 : 133117 : address->max_offset = address->min_offset + type_size;
1279 : :
1280 : : /* Put the terms into a canonical order for the hash table lookup below. */
1281 : 133117 : address->terms.qsort (compare_address_terms);
1282 : :
1283 : 133117 : if (dump_enabled_p ())
1284 : : {
1285 : 249 : dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr);
1286 : 249 : if (multiplier != 1)
1287 : 118 : dump_printf (MSG_NOTE, " * %wd", multiplier);
1288 : 249 : dump_printf (MSG_NOTE, " = ");
1289 : 249 : dump_address_info (MSG_NOTE, *address);
1290 : 249 : dump_printf (MSG_NOTE, "\n");
1291 : : }
1292 : :
1293 : : /* Pool address information with the same terms (but potentially
1294 : : different offsets). */
1295 : 133117 : address_info **slot = m_address_table.find_slot (address, INSERT);
1296 : 133117 : if (address_info *old_address = *slot)
1297 : : {
1298 : : /* We've already seen an address with the same terms. Extend the
1299 : : offset range to account for this access. Doing this can paper
1300 : : over gaps, such as in:
1301 : :
1302 : : a[i * stride * 4] + a[i * stride * 4 + 3];
1303 : :
1304 : : where nothing references "+ 1" or "+ 2". However, the vectorizer
1305 : : handles such gapped accesses without problems, so it's not worth
1306 : : trying to exclude them. */
1307 : 70267 : if (old_address->min_offset > address->min_offset)
1308 : 17618 : old_address->min_offset = address->min_offset;
1309 : 70267 : if (old_address->max_offset < address->max_offset)
1310 : 13312 : old_address->max_offset = address->max_offset;
1311 : 70267 : obstack_free (&m_obstack, address);
1312 : : }
1313 : : else
1314 : : {
1315 : : /* This is the first time we've seen an address with these terms. */
1316 : 62850 : *slot = address;
1317 : 62850 : m_address_list.safe_push (address);
1318 : : }
1319 : : }
1320 : :
1321 : : /* Analyze expression EXPR, which occurs in STMT. */
1322 : :
1323 : : void
1324 : 1461018 : loop_versioning::analyze_expr (gimple *stmt, tree expr)
1325 : : {
1326 : 1461018 : unsigned HOST_WIDE_INT type_size;
1327 : :
1328 : 1627357 : while (handled_component_p (expr))
1329 : : {
1330 : : /* See whether we can use versioning to avoid a multiplication
1331 : : in an array index. */
1332 : 166339 : if (TREE_CODE (expr) == ARRAY_REF
1333 : 166339 : && acceptable_type_p (TREE_TYPE (expr), &type_size))
1334 : 96491 : record_address_fragment (stmt, type_size,
1335 : 96491 : TREE_OPERAND (expr, 1), type_size, 0);
1336 : 166339 : expr = TREE_OPERAND (expr, 0);
1337 : : }
1338 : :
1339 : : /* See whether we can use versioning to avoid a multiplication
1340 : : in the pointer calculation of a MEM_REF. */
1341 : 1461018 : if (TREE_CODE (expr) == MEM_REF
1342 : 1461018 : && acceptable_type_p (TREE_TYPE (expr), &type_size))
1343 : 60674 : record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1,
1344 : : /* This is heuristic only, so don't worry
1345 : : about truncation or overflow. */
1346 : 60674 : TREE_INT_CST_LOW (TREE_OPERAND (expr, 1)));
1347 : :
1348 : : /* These would be easy to handle if they existed at this stage. */
1349 : 1461018 : gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF);
1350 : 1461018 : }
1351 : :
1352 : : /* Analyze all the statements in BB looking for useful version checks.
1353 : : Return true on success, false if something prevents the block from
1354 : : being versioned. */
1355 : :
1356 : : bool
1357 : 243471 : loop_versioning::analyze_block (basic_block bb)
1358 : : {
1359 : 243471 : class loop *loop = bb->loop_father;
1360 : 243471 : loop_info &li = get_loop_info (loop);
1361 : 1302220 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1362 : 815278 : gsi_next (&gsi))
1363 : : {
1364 : 832043 : gimple *stmt = gsi_stmt (gsi);
1365 : 832043 : if (is_gimple_debug (stmt))
1366 : 107516 : continue;
1367 : :
1368 : 746123 : if (expensive_stmt_p (stmt))
1369 : : {
1370 : 16765 : if (dump_enabled_p ())
1371 : 58 : dump_printf_loc (MSG_NOTE, stmt, "expensive statement"
1372 : : " prevents versioning: %G", stmt);
1373 : 16765 : return false;
1374 : : }
1375 : :
1376 : : /* Only look for direct versioning opportunities in inner loops
1377 : : since the benefit tends to be much smaller for outer loops. */
1378 : 707762 : if (!loop->inner)
1379 : : {
1380 : 594081 : unsigned int nops = gimple_num_ops (stmt);
1381 : 2243331 : for (unsigned int i = 0; i < nops; ++i)
1382 : 1649250 : if (tree op = gimple_op (stmt, i))
1383 : 1461018 : analyze_expr (stmt, op);
1384 : : }
1385 : :
1386 : : /* The point of the instruction limit is to prevent excessive
1387 : : code growth, so this is a size-based estimate even though
1388 : : the optimization is aimed at speed. */
1389 : 707762 : li.num_insns += estimate_num_insns (stmt, &eni_size_weights);
1390 : : }
1391 : :
1392 : : return true;
1393 : : }
1394 : :
1395 : : /* Analyze all the blocks in the function, looking for useful version checks.
1396 : : Return true if we found one. */
1397 : :
1398 : : bool
1399 : 25178 : loop_versioning::analyze_blocks ()
1400 : : {
1401 : 25178 : AUTO_DUMP_SCOPE ("analyze_blocks",
1402 : : dump_user_location_t::from_function_decl (m_fn->decl));
1403 : :
1404 : : /* For now we don't try to version the whole function, although
1405 : : versioning at that level could be useful in some cases. */
1406 : 25178 : get_loop_info (get_loop (m_fn, 0)).rejected_p = true;
1407 : :
1408 : 148895 : for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
1409 : : {
1410 : 73361 : loop_info &linfo = get_loop_info (loop);
1411 : :
1412 : : /* Ignore cold loops. */
1413 : 73361 : if (!optimize_loop_for_speed_p (loop))
1414 : 1397 : linfo.rejected_p = true;
1415 : :
1416 : : /* See whether an inner loop prevents versioning of this loop. */
1417 : 73361 : if (!linfo.rejected_p)
1418 : 84259 : for (class loop *inner = loop->inner; inner; inner = inner->next)
1419 : 16284 : if (get_loop_info (inner).rejected_p)
1420 : : {
1421 : 3989 : linfo.rejected_p = true;
1422 : 3989 : break;
1423 : : }
1424 : :
1425 : : /* If versioning the loop is still a possibility, examine the
1426 : : statements in the loop to look for versioning opportunities. */
1427 : 73361 : if (!linfo.rejected_p)
1428 : : {
1429 : 67975 : void *start_point = obstack_alloc (&m_obstack, 0);
1430 : :
1431 : 294681 : for (basic_block bb = linfo.block_list; bb;
1432 : 226706 : bb = m_next_block_in_loop[bb->index])
1433 : 243471 : if (!analyze_block (bb))
1434 : : {
1435 : 16765 : linfo.rejected_p = true;
1436 : 16765 : break;
1437 : : }
1438 : :
1439 : 67975 : if (!linfo.rejected_p)
1440 : : {
1441 : : /* Process any queued address fragments, now that we have
1442 : : complete grouping information. */
1443 : : address_info *address;
1444 : : unsigned int i;
1445 : 109082 : FOR_EACH_VEC_ELT (m_address_list, i, address)
1446 : 57872 : analyze_address_fragment (*address);
1447 : : }
1448 : :
1449 : 67975 : m_address_table.empty ();
1450 : 67975 : m_address_list.truncate (0);
1451 : 67975 : obstack_free (&m_obstack, start_point);
1452 : : }
1453 : 25178 : }
1454 : :
1455 : 25178 : return m_num_conditions != 0;
1456 : : }
1457 : :
1458 : : /* Use the ranges in VRS to remove impossible versioning conditions from
1459 : : LOOP. */
1460 : :
1461 : : void
1462 : 4091 : loop_versioning::prune_loop_conditions (class loop *loop)
1463 : : {
1464 : 4091 : loop_info &li = get_loop_info (loop);
1465 : :
1466 : 4091 : int to_remove = -1;
1467 : 4091 : bitmap_iterator bi;
1468 : 4091 : unsigned int i;
1469 : 4091 : int_range_max r;
1470 : 5475 : EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1471 : : {
1472 : 1384 : tree name = ssa_name (i);
1473 : 1384 : gimple *stmt = first_stmt (loop->header);
1474 : :
1475 : 2768 : if (get_range_query (cfun)->range_of_expr (r, name, stmt)
1476 : 2768 : && !r.contains_p (wi::one (TYPE_PRECISION (TREE_TYPE (name)))))
1477 : : {
1478 : 32 : if (dump_enabled_p ())
1479 : 2 : dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1480 : : "%T can never be 1 in this loop\n", name);
1481 : :
1482 : 32 : if (to_remove >= 0)
1483 : 0 : bitmap_clear_bit (&li.unity_names, to_remove);
1484 : 32 : to_remove = i;
1485 : 32 : m_num_conditions -= 1;
1486 : : }
1487 : : }
1488 : 4091 : if (to_remove >= 0)
1489 : 32 : bitmap_clear_bit (&li.unity_names, to_remove);
1490 : 4091 : }
1491 : :
1492 : : /* Remove any scheduled loop version conditions that will never be true.
1493 : : Return true if any remain. */
1494 : :
1495 : : bool
1496 : 658 : loop_versioning::prune_conditions ()
1497 : : {
1498 : 658 : AUTO_DUMP_SCOPE ("prune_loop_conditions",
1499 : : dump_user_location_t::from_function_decl (m_fn->decl));
1500 : :
1501 : 658 : calculate_dominance_info (CDI_DOMINATORS);
1502 : 658 : lv_dom_walker dom_walker (*this);
1503 : 658 : dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn));
1504 : 658 : return m_num_conditions != 0;
1505 : 658 : }
1506 : :
1507 : : /* Merge the version checks for INNER into immediately-enclosing loop
1508 : : OUTER. */
1509 : :
1510 : : void
1511 : 528 : loop_versioning::merge_loop_info (class loop *outer, class loop *inner)
1512 : : {
1513 : 528 : loop_info &inner_li = get_loop_info (inner);
1514 : 528 : loop_info &outer_li = get_loop_info (outer);
1515 : :
1516 : 528 : if (dump_enabled_p ())
1517 : : {
1518 : 11 : bitmap_iterator bi;
1519 : 11 : unsigned int i;
1520 : 22 : EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi)
1521 : 11 : if (!bitmap_bit_p (&outer_li.unity_names, i))
1522 : 11 : dump_printf_loc (MSG_NOTE, find_loop_location (inner),
1523 : : "hoisting check that %T == 1 to outer loop\n",
1524 : 11 : ssa_name (i));
1525 : : }
1526 : :
1527 : 528 : bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names);
1528 : 541 : if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost))
1529 : 366 : outer_li.outermost = inner_li.outermost;
1530 : 528 : }
1531 : :
1532 : : /* Add LOOP to the queue of loops to version. */
1533 : :
1534 : : void
1535 : 1109 : loop_versioning::add_loop_to_queue (class loop *loop)
1536 : : {
1537 : 1109 : loop_info &li = get_loop_info (loop);
1538 : :
1539 : 1109 : if (dump_enabled_p ())
1540 : 68 : dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1541 : : "queuing this loop for versioning\n");
1542 : 1109 : m_loops_to_version.safe_push (loop);
1543 : :
1544 : : /* Don't try to version superloops. */
1545 : 1109 : li.rejected_p = true;
1546 : 1109 : }
1547 : :
1548 : : /* Decide whether the cost model would allow us to version LOOP,
1549 : : either directly or as part of a parent loop, and return true if so.
1550 : : This does not imply that the loop is actually worth versioning in its
1551 : : own right, just that it would be valid to version it if something
1552 : : benefited.
1553 : :
1554 : : We have already made this decision for all inner loops of LOOP. */
1555 : :
1556 : : bool
1557 : 3396 : loop_versioning::decide_whether_loop_is_versionable (class loop *loop)
1558 : : {
1559 : 3396 : loop_info &li = get_loop_info (loop);
1560 : :
1561 : 3396 : if (li.rejected_p)
1562 : : return false;
1563 : :
1564 : : /* Examine the decisions made for inner loops. */
1565 : 3225 : for (class loop *inner = loop->inner; inner; inner = inner->next)
1566 : : {
1567 : 640 : loop_info &inner_li = get_loop_info (inner);
1568 : 640 : if (inner_li.rejected_p)
1569 : : {
1570 : 6 : if (dump_enabled_p ())
1571 : 0 : dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1572 : : "not versioning this loop because one of its"
1573 : : " inner loops should not be versioned\n");
1574 : 6 : return false;
1575 : : }
1576 : :
1577 : 634 : if (inner_li.worth_versioning_p ())
1578 : 405 : li.subloops_benefit_p = true;
1579 : :
1580 : : /* Accumulate the number of instructions from subloops that are not
1581 : : the innermost, or that don't benefit from versioning. Only the
1582 : : instructions from innermost loops that benefit from versioning
1583 : : should be weighed against loop-versioning-max-inner-insns;
1584 : : everything else should be weighed against
1585 : : loop-versioning-max-outer-insns. */
1586 : 634 : if (!inner_li.worth_versioning_p () || inner->inner)
1587 : : {
1588 : 300 : if (dump_enabled_p ())
1589 : 0 : dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1590 : : "counting %d instructions from this loop"
1591 : : " against its parent loop\n", inner_li.num_insns);
1592 : 300 : li.num_insns += inner_li.num_insns;
1593 : : }
1594 : : }
1595 : :
1596 : : /* Enforce the size limits. */
1597 : 4057 : if (li.worth_versioning_p ())
1598 : : {
1599 : 1490 : unsigned int max_num_insns = max_insns_for_loop (loop);
1600 : 1490 : if (dump_enabled_p ())
1601 : 79 : dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1602 : : "this loop has %d instructions, against"
1603 : : " a versioning limit of %d\n",
1604 : : li.num_insns, max_num_insns);
1605 : 1490 : if (li.num_insns > max_num_insns)
1606 : : {
1607 : 10 : if (dump_enabled_p ())
1608 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION
1609 : : | MSG_PRIORITY_USER_FACING,
1610 : 0 : find_loop_location (loop),
1611 : : "this loop is too big to version");
1612 : 10 : return false;
1613 : : }
1614 : : }
1615 : :
1616 : : /* Hoist all version checks from subloops to this loop. */
1617 : 3103 : for (class loop *subloop = loop->inner; subloop; subloop = subloop->next)
1618 : 528 : merge_loop_info (loop, subloop);
1619 : :
1620 : : return true;
1621 : : }
1622 : :
1623 : : /* Decide which loops to version and add them to the versioning queue.
1624 : : Return true if there are any loops to version. */
1625 : :
1626 : : bool
1627 : 649 : loop_versioning::make_versioning_decisions ()
1628 : : {
1629 : 649 : AUTO_DUMP_SCOPE ("make_versioning_decisions",
1630 : : dump_user_location_t::from_function_decl (m_fn->decl));
1631 : :
1632 : 5343 : for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
1633 : : {
1634 : 3396 : loop_info &linfo = get_loop_info (loop);
1635 : 3396 : if (decide_whether_loop_is_versionable (loop))
1636 : : {
1637 : : /* Commit to versioning LOOP directly if we can't hoist the
1638 : : version checks any further. */
1639 : 7451 : if (linfo.worth_versioning_p ()
1640 : 2575 : && (loop_depth (loop) == 1 || linfo.outermost == loop))
1641 : 1043 : add_loop_to_queue (loop);
1642 : : }
1643 : : else
1644 : : {
1645 : : /* We can't version this loop, so individually version any
1646 : : subloops that would benefit and haven't been versioned yet. */
1647 : 821 : linfo.rejected_p = true;
1648 : 1465 : for (class loop *subloop = loop->inner; subloop;
1649 : 644 : subloop = subloop->next)
1650 : 842 : if (get_loop_info (subloop).worth_versioning_p ())
1651 : 66 : add_loop_to_queue (subloop);
1652 : : }
1653 : 649 : }
1654 : :
1655 : 1298 : return !m_loops_to_version.is_empty ();
1656 : : }
1657 : :
1658 : : /* Attempt to implement loop versioning for LOOP, using the information
1659 : : cached in the associated loop_info. Return true on success. */
1660 : :
1661 : : bool
1662 : 1109 : loop_versioning::version_loop (class loop *loop)
1663 : : {
1664 : 1109 : loop_info &li = get_loop_info (loop);
1665 : :
1666 : : /* Build up a condition that selects the original loop instead of
1667 : : the simplified loop. */
1668 : 1109 : tree cond = boolean_false_node;
1669 : 1109 : bitmap_iterator bi;
1670 : 1109 : unsigned int i;
1671 : 2457 : EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1672 : : {
1673 : 1348 : tree name = ssa_name (i);
1674 : 1348 : tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name,
1675 : : build_one_cst (TREE_TYPE (name)));
1676 : 1348 : cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one);
1677 : : }
1678 : :
1679 : : /* Convert the condition into a suitable gcond. */
1680 : 1109 : gimple_seq stmts = NULL;
1681 : 1109 : cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr_for_cond,
1682 : : NULL_TREE);
1683 : :
1684 : : /* Version the loop. */
1685 : 1109 : initialize_original_copy_tables ();
1686 : 1109 : basic_block cond_bb;
1687 : 1109 : li.optimized_loop = loop_version (loop, cond, &cond_bb,
1688 : : profile_probability::unlikely (),
1689 : : profile_probability::likely (),
1690 : : profile_probability::unlikely (),
1691 : : profile_probability::likely (), true);
1692 : 1109 : free_original_copy_tables ();
1693 : 1109 : if (!li.optimized_loop)
1694 : : {
1695 : 2 : if (dump_enabled_p ())
1696 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop),
1697 : : "tried but failed to version this loop for when"
1698 : : " certain strides are 1\n");
1699 : 2 : return false;
1700 : : }
1701 : :
1702 : 1107 : if (dump_enabled_p ())
1703 : 68 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
1704 : : "versioned this loop for when certain strides are 1\n");
1705 : :
1706 : : /* Insert the statements that feed COND. */
1707 : 1107 : if (stmts)
1708 : : {
1709 : 165 : gimple_stmt_iterator gsi = gsi_last_bb (cond_bb);
1710 : 165 : gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1711 : : }
1712 : :
1713 : : return true;
1714 : : }
1715 : :
1716 : : /* Attempt to version all loops in the versioning queue. */
1717 : :
1718 : : void
1719 : 649 : loop_versioning::implement_versioning_decisions ()
1720 : : {
1721 : : /* No AUTO_DUMP_SCOPE here since all messages are top-level and
1722 : : user-facing at this point. */
1723 : :
1724 : 649 : bool any_succeeded_p = false;
1725 : 649 : class loop *loop;
1726 : 649 : unsigned int i;
1727 : 1758 : FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1728 : 1109 : if (version_loop (loop))
1729 : 1107 : any_succeeded_p = true;
1730 : 649 : if (!any_succeeded_p)
1731 : 649 : return;
1732 : :
1733 : 647 : update_ssa (TODO_update_ssa);
1734 : :
1735 : : /* Simplify the new loop, which is used when COND is false. */
1736 : 1754 : FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1737 : : {
1738 : 1107 : loop_info &linfo = get_loop_info (loop);
1739 : 1107 : if (linfo.optimized_loop)
1740 : 1107 : name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header);
1741 : : }
1742 : : }
1743 : :
1744 : : /* Run the pass and return a set of TODO_* flags. */
1745 : :
1746 : : unsigned int
1747 : 25178 : loop_versioning::run ()
1748 : : {
1749 : 25178 : gcc_assert (scev_initialized_p ());
1750 : :
1751 : 25178 : if (analyze_blocks ()
1752 : 658 : && prune_conditions ()
1753 : 25827 : && make_versioning_decisions ())
1754 : 649 : implement_versioning_decisions ();
1755 : :
1756 : 25178 : return 0;
1757 : : }
1758 : :
1759 : : /* Loop versioning pass. */
1760 : :
1761 : : const pass_data pass_data_loop_versioning =
1762 : : {
1763 : : GIMPLE_PASS, /* type */
1764 : : "lversion", /* name */
1765 : : OPTGROUP_LOOP, /* optinfo_flags */
1766 : : TV_LOOP_VERSIONING, /* tv_id */
1767 : : PROP_cfg, /* properties_required */
1768 : : 0, /* properties_provided */
1769 : : 0, /* properties_destroyed */
1770 : : 0, /* todo_flags_start */
1771 : : 0, /* todo_flags_finish */
1772 : : };
1773 : :
1774 : : class pass_loop_versioning : public gimple_opt_pass
1775 : : {
1776 : : public:
1777 : 280114 : pass_loop_versioning (gcc::context *ctxt)
1778 : 560228 : : gimple_opt_pass (pass_data_loop_versioning, ctxt)
1779 : : {}
1780 : :
1781 : : /* opt_pass methods: */
1782 : 225983 : bool gate (function *) final override
1783 : : {
1784 : 225983 : return flag_version_loops_for_strides;
1785 : : }
1786 : : unsigned int execute (function *) final override;
1787 : : };
1788 : :
1789 : : unsigned int
1790 : 25178 : pass_loop_versioning::execute (function *fn)
1791 : : {
1792 : 50356 : if (number_of_loops (fn) <= 1)
1793 : : return 0;
1794 : :
1795 : 25178 : enable_ranger (fn);
1796 : 25178 : unsigned int ret = loop_versioning (fn).run ();
1797 : 25178 : disable_ranger (fn);
1798 : 25178 : return ret;
1799 : : }
1800 : :
1801 : : } // anon namespace
1802 : :
1803 : : gimple_opt_pass *
1804 : 280114 : make_pass_loop_versioning (gcc::context *ctxt)
1805 : : {
1806 : 280114 : return new pass_loop_versioning (ctxt);
1807 : : }
|