Line data Source code
1 : /* Vectorizer Specific Loop Manipulations
2 : Copyright (C) 2003-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : and Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #include "config.h"
23 : #include "system.h"
24 : #include "coretypes.h"
25 : #include "backend.h"
26 : #include "tree.h"
27 : #include "gimple.h"
28 : #include "cfghooks.h"
29 : #include "tree-pass.h"
30 : #include "ssa.h"
31 : #include "fold-const.h"
32 : #include "cfganal.h"
33 : #include "gimplify.h"
34 : #include "gimple-iterator.h"
35 : #include "gimplify-me.h"
36 : #include "tree-cfg.h"
37 : #include "tree-ssa-loop-manip.h"
38 : #include "tree-into-ssa.h"
39 : #include "tree-ssa.h"
40 : #include "cfgloop.h"
41 : #include "tree-scalar-evolution.h"
42 : #include "tree-vectorizer.h"
43 : #include "tree-ssa-loop-ivopts.h"
44 : #include "gimple-fold.h"
45 : #include "tree-ssa-loop-niter.h"
46 : #include "internal-fn.h"
47 : #include "stor-layout.h"
48 : #include "optabs-query.h"
49 : #include "vec-perm-indices.h"
50 : #include "insn-config.h"
51 : #include "rtl.h"
52 : #include "recog.h"
53 : #include "langhooks.h"
54 : #include "tree-vector-builder.h"
55 : #include "optabs-tree.h"
56 : #include "hierarchical_discriminator.h"
57 :
58 :
59 : /*************************************************************************
60 : Simple Loop Peeling Utilities
61 :
62 : Utilities to support loop peeling for vectorization purposes.
63 : *************************************************************************/
64 :
65 :
66 : /* Renames the use *OP_P. */
67 :
68 : static void
69 898323 : rename_use_op (use_operand_p op_p)
70 : {
71 898323 : tree new_name;
72 :
73 898323 : if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
74 : return;
75 :
76 894908 : new_name = get_current_def (USE_FROM_PTR (op_p));
77 :
78 : /* Something defined outside of the loop. */
79 894908 : if (!new_name)
80 : return;
81 :
82 : /* An ordinary ssa name defined in the loop. */
83 :
84 772526 : SET_USE (op_p, new_name);
85 : }
86 :
87 :
88 : /* Renames the variables in basic block BB. Allow renaming of PHI arguments
89 : on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
90 : true. */
91 :
92 : static void
93 110260 : rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
94 : {
95 110260 : gimple *stmt;
96 110260 : use_operand_p use_p;
97 110260 : ssa_op_iter iter;
98 110260 : edge e;
99 110260 : edge_iterator ei;
100 110260 : class loop *loop = bb->loop_father;
101 110260 : class loop *outer_loop = NULL;
102 :
103 110260 : if (rename_from_outer_loop)
104 : {
105 935 : gcc_assert (loop);
106 935 : outer_loop = loop_outer (loop);
107 : }
108 :
109 731344 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
110 510824 : gsi_next (&gsi))
111 : {
112 510824 : stmt = gsi_stmt (gsi);
113 1257378 : FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
114 746554 : rename_use_op (use_p);
115 : }
116 :
117 225603 : FOR_EACH_EDGE (e, ei, bb->preds)
118 : {
119 115343 : if (!flow_bb_inside_loop_p (loop, e->src))
120 : {
121 34488 : if (!rename_from_outer_loop)
122 34202 : continue;
123 286 : if (e->src != outer_loop->header)
124 : {
125 177 : if (outer_loop->inner->next)
126 : {
127 : /* If outer_loop has 2 inner loops, allow there to
128 : be an extra basic block which decides which of the
129 : two loops to use using LOOP_VECTORIZED. */
130 174 : if (!single_pred_p (e->src)
131 42 : || single_pred (e->src) != outer_loop->header)
132 132 : continue;
133 : }
134 : }
135 : }
136 184654 : for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
137 103645 : gsi_next (&gsi))
138 103645 : rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
139 : }
140 110260 : }
141 :
142 :
143 : struct adjust_info
144 : {
145 : tree from, to;
146 : basic_block bb;
147 : };
148 :
149 : /* A stack of values to be adjusted in debug stmts. We have to
150 : process them LIFO, so that the closest substitution applies. If we
151 : processed them FIFO, without the stack, we might substitute uses
152 : with a PHI DEF that would soon become non-dominant, and when we got
153 : to the suitable one, it wouldn't have anything to substitute any
154 : more. */
155 : static vec<adjust_info, va_heap> adjust_vec;
156 :
157 : /* Adjust any debug stmts that referenced AI->from values to use the
158 : loop-closed AI->to, if the references are dominated by AI->bb and
159 : not by the definition of AI->from. */
160 :
161 : static void
162 77934 : adjust_debug_stmts_now (adjust_info *ai)
163 : {
164 77934 : basic_block bbphi = ai->bb;
165 77934 : tree orig_def = ai->from;
166 77934 : tree new_def = ai->to;
167 77934 : imm_use_iterator imm_iter;
168 77934 : gimple *stmt;
169 77934 : basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
170 :
171 77934 : gcc_assert (dom_info_available_p (CDI_DOMINATORS));
172 :
173 : /* Adjust any debug stmts that held onto non-loop-closed
174 : references. */
175 389083 : FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
176 : {
177 233215 : use_operand_p use_p;
178 233215 : basic_block bbuse;
179 :
180 233215 : if (!is_gimple_debug (stmt))
181 175936 : continue;
182 :
183 57279 : gcc_assert (gimple_debug_bind_p (stmt));
184 :
185 57279 : bbuse = gimple_bb (stmt);
186 :
187 57279 : if ((bbuse == bbphi
188 57279 : || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
189 59111 : && !(bbuse == bbdef
190 916 : || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
191 : {
192 0 : if (new_def)
193 0 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
194 0 : SET_USE (use_p, new_def);
195 : else
196 : {
197 0 : gimple_debug_bind_reset_value (stmt);
198 0 : update_stmt (stmt);
199 : }
200 : }
201 77934 : }
202 77934 : }
203 :
204 : /* Adjust debug stmts as scheduled before. */
205 :
206 : static void
207 33175 : adjust_vec_debug_stmts (void)
208 : {
209 33175 : if (!MAY_HAVE_DEBUG_BIND_STMTS)
210 : return;
211 :
212 12378 : gcc_assert (adjust_vec.exists ());
213 :
214 90207 : while (!adjust_vec.is_empty ())
215 : {
216 77829 : adjust_debug_stmts_now (&adjust_vec.last ());
217 77829 : adjust_vec.pop ();
218 : }
219 : }
220 :
221 : /* Adjust any debug stmts that referenced FROM values to use the
222 : loop-closed TO, if the references are dominated by BB and not by
223 : the definition of FROM. If adjust_vec is non-NULL, adjustments
224 : will be postponed until adjust_vec_debug_stmts is called. */
225 :
226 : static void
227 105989 : adjust_debug_stmts (tree from, tree to, basic_block bb)
228 : {
229 105989 : adjust_info ai;
230 :
231 105989 : if (MAY_HAVE_DEBUG_BIND_STMTS
232 105989 : && TREE_CODE (from) == SSA_NAME
233 96626 : && ! SSA_NAME_IS_DEFAULT_DEF (from)
234 201258 : && ! virtual_operand_p (from))
235 : {
236 77934 : ai.from = from;
237 77934 : ai.to = to;
238 77934 : ai.bb = bb;
239 :
240 77934 : if (adjust_vec.exists ())
241 77829 : adjust_vec.safe_push (ai);
242 : else
243 105 : adjust_debug_stmts_now (&ai);
244 : }
245 105989 : }
246 :
247 : /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
248 : to adjust any debug stmts that referenced the old phi arg,
249 : presumably non-loop-closed references left over from other
250 : transformations. */
251 :
252 : static void
253 214647 : adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
254 : {
255 214647 : tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
256 :
257 214647 : gcc_assert (TREE_CODE (orig_def) != SSA_NAME
258 : || orig_def != new_def);
259 :
260 214647 : SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
261 :
262 214647 : if (MAY_HAVE_DEBUG_BIND_STMTS)
263 83079 : adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
264 : gimple_bb (update_phi));
265 214647 : }
266 :
267 : /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
268 : that the control should have during the first iteration and NEXT_CTRL is the
269 : value that it should have on subsequent iterations. */
270 :
271 : static void
272 86 : vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
273 : tree next_ctrl)
274 : {
275 86 : gphi *phi = create_phi_node (ctrl, loop->header);
276 86 : add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
277 86 : add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
278 86 : }
279 :
280 : /* Add SEQ to the end of LOOP's preheader block. */
281 :
282 : static void
283 18 : add_preheader_seq (class loop *loop, gimple_seq seq)
284 : {
285 18 : if (seq)
286 : {
287 15 : edge pe = loop_preheader_edge (loop);
288 15 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
289 15 : gcc_assert (!new_bb);
290 : }
291 18 : }
292 :
293 : /* Add SEQ to the beginning of LOOP's header block. */
294 :
295 : static void
296 0 : add_header_seq (class loop *loop, gimple_seq seq)
297 : {
298 0 : if (seq)
299 : {
300 0 : gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
301 0 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
302 : }
303 0 : }
304 :
305 : /* Return true if the target can interleave elements of two vectors.
306 : OFFSET is 0 if the first half of the vectors should be interleaved
307 : or 1 if the second half should. When returning true, store the
308 : associated permutation in INDICES. */
309 :
310 : static bool
311 0 : interleave_supported_p (vec_perm_indices *indices, tree vectype,
312 : unsigned int offset)
313 : {
314 0 : poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
315 0 : poly_uint64 base = exact_div (nelts, 2) * offset;
316 0 : vec_perm_builder sel (nelts, 2, 3);
317 0 : for (unsigned int i = 0; i < 3; ++i)
318 : {
319 0 : sel.quick_push (base + i);
320 0 : sel.quick_push (base + i + nelts);
321 : }
322 0 : indices->new_vector (sel, 2, nelts);
323 0 : return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
324 0 : *indices);
325 0 : }
326 :
327 : /* Try to use permutes to define the masks in DEST_RGM using the masks
328 : in SRC_RGM, given that the former has twice as many masks as the
329 : latter. Return true on success, adding any new statements to SEQ. */
330 :
331 : static bool
332 0 : vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
333 : rgroup_controls *src_rgm)
334 : {
335 0 : tree src_masktype = src_rgm->type;
336 0 : tree dest_masktype = dest_rgm->type;
337 0 : machine_mode src_mode = TYPE_MODE (src_masktype);
338 0 : insn_code icode1, icode2;
339 0 : if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
340 0 : && (icode1 = optab_handler (vec_unpacku_hi_optab,
341 : src_mode)) != CODE_FOR_nothing
342 0 : && (icode2 = optab_handler (vec_unpacku_lo_optab,
343 : src_mode)) != CODE_FOR_nothing)
344 : {
345 : /* Unpacking the source masks gives at least as many mask bits as
346 : we need. We can then VIEW_CONVERT any excess bits away. */
347 0 : machine_mode dest_mode = insn_data[icode1].operand[0].mode;
348 0 : gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
349 0 : tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
350 0 : for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
351 : {
352 0 : tree src = src_rgm->controls[i / 2];
353 0 : tree dest = dest_rgm->controls[i];
354 0 : tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
355 0 : ? VEC_UNPACK_HI_EXPR
356 : : VEC_UNPACK_LO_EXPR);
357 0 : gassign *stmt;
358 0 : if (dest_masktype == unpack_masktype)
359 0 : stmt = gimple_build_assign (dest, code, src);
360 : else
361 : {
362 0 : tree temp = make_ssa_name (unpack_masktype);
363 0 : stmt = gimple_build_assign (temp, code, src);
364 0 : gimple_seq_add_stmt (seq, stmt);
365 0 : stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
366 : build1 (VIEW_CONVERT_EXPR,
367 : dest_masktype, temp));
368 : }
369 0 : gimple_seq_add_stmt (seq, stmt);
370 : }
371 : return true;
372 : }
373 0 : vec_perm_indices indices[2];
374 0 : if (dest_masktype == src_masktype
375 0 : && interleave_supported_p (&indices[0], src_masktype, 0)
376 0 : && interleave_supported_p (&indices[1], src_masktype, 1))
377 : {
378 : /* The destination requires twice as many mask bits as the source, so
379 : we can use interleaving permutes to double up the number of bits. */
380 : tree masks[2];
381 0 : for (unsigned int i = 0; i < 2; ++i)
382 0 : masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
383 0 : for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
384 : {
385 0 : tree src = src_rgm->controls[i / 2];
386 0 : tree dest = dest_rgm->controls[i];
387 0 : gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
388 0 : src, src, masks[i & 1]);
389 0 : gimple_seq_add_stmt (seq, stmt);
390 : }
391 0 : return true;
392 : }
393 : return false;
394 0 : }
395 :
396 : /* Populate DEST_RGM->controls, given that they should add up to STEP.
397 :
398 : STEP = MIN_EXPR <ivtmp_34, VF>;
399 :
400 : First length (MIN (X, VF/N)):
401 : loop_len_15 = MIN_EXPR <STEP, VF/N>;
402 :
403 : Second length:
404 : tmp = STEP - loop_len_15;
405 : loop_len_16 = MIN (tmp, VF/N);
406 :
407 : Third length:
408 : tmp2 = tmp - loop_len_16;
409 : loop_len_17 = MIN (tmp2, VF/N);
410 :
411 : Last length:
412 : loop_len_18 = tmp2 - loop_len_17;
413 : */
414 :
415 : static void
416 0 : vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
417 : rgroup_controls *dest_rgm, tree step)
418 : {
419 0 : tree ctrl_type = dest_rgm->type;
420 0 : poly_uint64 nitems_per_ctrl
421 0 : = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
422 0 : tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
423 :
424 0 : for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
425 : {
426 0 : tree ctrl = dest_rgm->controls[i];
427 0 : if (i == 0)
428 : {
429 : /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */
430 0 : gassign *assign
431 0 : = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
432 0 : gimple_seq_add_stmt (seq, assign);
433 : }
434 0 : else if (i == dest_rgm->controls.length () - 1)
435 : {
436 : /* Last iteration: Remain capped to the range [0, VF/N]. */
437 0 : gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
438 0 : dest_rgm->controls[i - 1]);
439 0 : gimple_seq_add_stmt (seq, assign);
440 : }
441 : else
442 : {
443 : /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
444 0 : step = gimple_build (seq, MINUS_EXPR, iv_type, step,
445 0 : dest_rgm->controls[i - 1]);
446 0 : gassign *assign
447 0 : = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
448 0 : gimple_seq_add_stmt (seq, assign);
449 : }
450 : }
451 0 : }
452 :
453 : /* Stores the standard position for induction variable increment in belonging to
454 : LOOP_EXIT (just before the exit condition of the given exit to BSI.
455 : INSERT_AFTER is set to true if the increment should be inserted after
456 : *BSI. */
457 :
458 : void
459 62018 : vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
460 : bool *insert_after)
461 : {
462 62018 : basic_block bb = loop_exit->src;
463 62018 : *bsi = gsi_last_bb (bb);
464 62018 : *insert_after = false;
465 62018 : }
466 :
467 : /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
468 : for all the rgroup controls in RGC and return a control that is nonzero
469 : when the loop needs to iterate. Add any new preheader statements to
470 : PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
471 :
472 : RGC belongs to loop LOOP. The loop originally iterated NITERS
473 : times and has been vectorized according to LOOP_VINFO.
474 :
475 : If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
476 : starts with NITERS_SKIP dummy iterations of the scalar loop before
477 : the real work starts. The mask elements for these dummy iterations
478 : must be 0, to ensure that the extra iterations do not have an effect.
479 :
480 : It is known that:
481 :
482 : NITERS * RGC->max_nscalars_per_iter * RGC->factor
483 :
484 : does not overflow. However, MIGHT_WRAP_P says whether an induction
485 : variable that starts at 0 and has step:
486 :
487 : VF * RGC->max_nscalars_per_iter * RGC->factor
488 :
489 : might overflow before hitting a value above:
490 :
491 : (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
492 :
493 : This means that we cannot guarantee that such an induction variable
494 : would ever hit a value that produces a set of all-false masks or zero
495 : lengths for RGC.
496 :
497 : Note: the cost of the code generated by this function is modeled
498 : by vect_estimate_min_profitable_iters, so changes here may need
499 : corresponding changes there. */
500 :
501 : static tree
502 0 : vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
503 : gimple_seq *preheader_seq,
504 : gimple_seq *header_seq,
505 : gimple_stmt_iterator loop_cond_gsi,
506 : rgroup_controls *rgc, tree niters,
507 : tree niters_skip, bool might_wrap_p,
508 : tree *iv_step, tree *compare_step)
509 : {
510 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
511 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
512 0 : bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
513 :
514 0 : tree ctrl_type = rgc->type;
515 0 : unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
516 0 : poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
517 0 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
518 0 : tree length_limit = NULL_TREE;
519 : /* For length, we need length_limit to ensure length in range. */
520 0 : if (!use_masks_p)
521 0 : length_limit = build_int_cst (compare_type, nitems_per_ctrl);
522 :
523 : /* Calculate the maximum number of item values that the rgroup
524 : handles in total, the number that it handles for each iteration
525 : of the vector loop, and the number that it should skip during the
526 : first iteration of the vector loop. */
527 0 : tree nitems_total = niters;
528 0 : tree nitems_step = build_int_cst (iv_type, vf);
529 0 : tree nitems_skip = niters_skip;
530 0 : if (nitems_per_iter != 1)
531 : {
532 : /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
533 : these multiplications don't overflow. */
534 0 : tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
535 0 : tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
536 0 : nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
537 : nitems_total, compare_factor);
538 0 : nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
539 : nitems_step, iv_factor);
540 0 : if (nitems_skip)
541 0 : nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
542 : nitems_skip, compare_factor);
543 : }
544 :
545 : /* Create an induction variable that counts the number of items
546 : processed. */
547 0 : tree index_before_incr, index_after_incr;
548 0 : gimple_stmt_iterator incr_gsi;
549 0 : bool insert_after;
550 0 : edge exit_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
551 0 : vect_iv_increment_position (exit_e, &incr_gsi, &insert_after);
552 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
553 : {
554 : /* Create an IV that counts down from niters_total and whose step
555 : is the (variable) amount processed in the current iteration:
556 : ...
557 : _10 = (unsigned long) count_12(D);
558 : ...
559 : # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
560 : _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
561 : ...
562 : vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
563 : ...
564 : ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
565 : ...
566 : if (ivtmp_9 > POLY_INT_CST [4, 4])
567 : goto <bb 4>; [83.33%]
568 : else
569 : goto <bb 5>; [16.67%]
570 : */
571 0 : nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
572 0 : tree step = rgc->controls.length () == 1 ? rgc->controls[0]
573 0 : : make_ssa_name (iv_type);
574 : /* Create decrement IV. */
575 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
576 : {
577 0 : create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
578 : insert_after, &index_before_incr, &index_after_incr);
579 0 : tree vectype = build_zero_cst (rgc->type);
580 0 : tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
581 : index_before_incr, nitems_step,
582 : vectype);
583 0 : gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
584 : }
585 : else
586 : {
587 0 : create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
588 : &incr_gsi, insert_after, &index_before_incr,
589 : &index_after_incr);
590 0 : gimple_seq_add_stmt (header_seq,
591 0 : gimple_build_assign (step, MIN_EXPR,
592 : index_before_incr,
593 : nitems_step));
594 : }
595 0 : *iv_step = step;
596 0 : *compare_step = nitems_step;
597 0 : return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
598 0 : : index_before_incr;
599 : }
600 :
601 : /* Create increment IV. */
602 0 : create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
603 : loop, &incr_gsi, insert_after, &index_before_incr,
604 : &index_after_incr);
605 :
606 0 : tree zero_index = build_int_cst (compare_type, 0);
607 0 : tree test_index, test_limit, first_limit;
608 0 : gimple_stmt_iterator *test_gsi;
609 0 : if (might_wrap_p)
610 : {
611 : /* In principle the loop should stop iterating once the incremented
612 : IV reaches a value greater than or equal to:
613 :
614 : NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
615 :
616 : However, there's no guarantee that this addition doesn't overflow
617 : the comparison type, or that the IV hits a value above it before
618 : wrapping around. We therefore adjust the limit down by one
619 : IV step:
620 :
621 : (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
622 : -[infinite-prec] NITEMS_STEP
623 :
624 : and compare the IV against this limit _before_ incrementing it.
625 : Since the comparison type is unsigned, we actually want the
626 : subtraction to saturate at zero:
627 :
628 : (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
629 : -[sat] NITEMS_STEP
630 :
631 : And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
632 :
633 : NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
634 :
635 : where the rightmost subtraction can be done directly in
636 : COMPARE_TYPE. */
637 0 : test_index = index_before_incr;
638 0 : tree adjust = gimple_convert (preheader_seq, compare_type,
639 : nitems_step);
640 0 : if (nitems_skip)
641 0 : adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
642 : adjust, nitems_skip);
643 0 : test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
644 : nitems_total, adjust);
645 0 : test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
646 : test_limit, adjust);
647 0 : test_gsi = &incr_gsi;
648 :
649 : /* Get a safe limit for the first iteration. */
650 0 : if (nitems_skip)
651 : {
652 : /* The first vector iteration can handle at most NITEMS_STEP
653 : items. NITEMS_STEP <= CONST_LIMIT, and adding
654 : NITEMS_SKIP to that cannot overflow. */
655 0 : tree const_limit = build_int_cst (compare_type,
656 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)
657 0 : * nitems_per_iter);
658 0 : first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
659 : nitems_total, const_limit);
660 0 : first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
661 : first_limit, nitems_skip);
662 : }
663 : else
664 : /* For the first iteration it doesn't matter whether the IV hits
665 : a value above NITEMS_TOTAL. That only matters for the latch
666 : condition. */
667 : first_limit = nitems_total;
668 : }
669 : else
670 : {
671 : /* Test the incremented IV, which will always hit a value above
672 : the bound before wrapping. */
673 0 : test_index = index_after_incr;
674 0 : test_limit = nitems_total;
675 0 : if (nitems_skip)
676 0 : test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
677 : test_limit, nitems_skip);
678 : test_gsi = &loop_cond_gsi;
679 :
680 : first_limit = test_limit;
681 : }
682 :
683 : /* Convert the IV value to the comparison type (either a no-op or
684 : a demotion). */
685 0 : gimple_seq test_seq = NULL;
686 0 : test_index = gimple_convert (&test_seq, compare_type, test_index);
687 0 : gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
688 :
689 : /* Provide a definition of each control in the group. */
690 0 : tree next_ctrl = NULL_TREE;
691 0 : tree ctrl;
692 0 : unsigned int i;
693 0 : FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
694 : {
695 : /* Previous controls will cover BIAS items. This control covers the
696 : next batch. */
697 0 : poly_uint64 bias = nitems_per_ctrl * i;
698 0 : tree bias_tree = build_int_cst (compare_type, bias);
699 :
700 : /* See whether the first iteration of the vector loop is known
701 : to have a full control. */
702 0 : poly_uint64 const_limit;
703 0 : bool first_iteration_full
704 0 : = (poly_int_tree_p (first_limit, &const_limit)
705 0 : && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
706 :
707 : /* Rather than have a new IV that starts at BIAS and goes up to
708 : TEST_LIMIT, prefer to use the same 0-based IV for each control
709 : and adjust the bound down by BIAS. */
710 0 : tree this_test_limit = test_limit;
711 0 : if (i != 0)
712 : {
713 0 : this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
714 : compare_type, this_test_limit,
715 : bias_tree);
716 0 : this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
717 : compare_type, this_test_limit,
718 : bias_tree);
719 : }
720 :
721 : /* Create the initial control. First include all items that
722 : are within the loop limit. */
723 0 : tree init_ctrl = NULL_TREE;
724 0 : if (!first_iteration_full)
725 : {
726 0 : tree start, end;
727 0 : if (first_limit == test_limit)
728 : {
729 : /* Use a natural test between zero (the initial IV value)
730 : and the loop limit. The "else" block would be valid too,
731 : but this choice can avoid the need to load BIAS_TREE into
732 : a register. */
733 : start = zero_index;
734 : end = this_test_limit;
735 : }
736 : else
737 : {
738 : /* FIRST_LIMIT is the maximum number of items handled by the
739 : first iteration of the vector loop. Test the portion
740 : associated with this control. */
741 0 : start = bias_tree;
742 0 : end = first_limit;
743 : }
744 :
745 0 : if (use_masks_p)
746 0 : init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
747 : start, end, "max_mask");
748 : else
749 : {
750 0 : init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
751 0 : gimple_seq seq = vect_gen_len (init_ctrl, start,
752 : end, length_limit);
753 0 : gimple_seq_add_seq (preheader_seq, seq);
754 : }
755 : }
756 :
757 : /* Now AND out the bits that are within the number of skipped
758 : items. */
759 0 : poly_uint64 const_skip;
760 0 : if (nitems_skip
761 0 : && !(poly_int_tree_p (nitems_skip, &const_skip)
762 0 : && known_le (const_skip, bias)))
763 : {
764 0 : gcc_assert (use_masks_p);
765 0 : tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
766 : bias_tree, nitems_skip);
767 0 : if (init_ctrl)
768 0 : init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
769 : init_ctrl, unskipped_mask);
770 : else
771 : init_ctrl = unskipped_mask;
772 : }
773 :
774 0 : if (!init_ctrl)
775 : {
776 : /* First iteration is full. */
777 0 : if (use_masks_p)
778 0 : init_ctrl = build_minus_one_cst (ctrl_type);
779 : else
780 : init_ctrl = length_limit;
781 : }
782 :
783 : /* Get the control value for the next iteration of the loop. */
784 0 : if (use_masks_p)
785 : {
786 0 : gimple_seq stmts = NULL;
787 0 : next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
788 : this_test_limit, "next_mask");
789 0 : gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
790 : }
791 : else
792 : {
793 0 : next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
794 0 : gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
795 : length_limit);
796 0 : gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
797 : }
798 :
799 0 : vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
800 : }
801 :
802 0 : int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
803 0 : if (partial_load_bias != 0)
804 : {
805 0 : tree adjusted_len = rgc->bias_adjusted_ctrl;
806 0 : gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
807 0 : rgc->controls[0],
808 : build_int_cst
809 0 : (TREE_TYPE (rgc->controls[0]),
810 0 : partial_load_bias));
811 0 : gimple_seq_add_stmt (header_seq, minus);
812 : }
813 :
814 : return next_ctrl;
815 : }
816 :
817 : /* Set up the iteration condition and rgroup controls for LOOP, given
818 : that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
819 : loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
820 : the number of iterations of the original scalar loop that should be
821 : handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
822 : for vect_set_loop_condition.
823 :
824 : Insert the branch-back condition before LOOP_COND_GSI and return the
825 : final gcond. */
826 :
827 : static gcond *
828 0 : vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
829 : loop_vec_info loop_vinfo, tree niters,
830 : tree final_iv, bool niters_maybe_zero,
831 : gimple_stmt_iterator loop_cond_gsi)
832 : {
833 0 : gimple_seq preheader_seq = NULL;
834 0 : gimple_seq header_seq = NULL;
835 :
836 0 : bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
837 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
838 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
839 0 : tree orig_niters = niters;
840 :
841 : /* Type of the initial value of NITERS. */
842 0 : tree ni_actual_type = TREE_TYPE (niters);
843 0 : unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
844 0 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
845 0 : if (niters_skip)
846 0 : niters_skip = gimple_convert (&preheader_seq, compare_type, niters_skip);
847 :
848 : /* Convert NITERS to the same size as the compare. */
849 0 : if (compare_precision > ni_actual_precision
850 0 : && niters_maybe_zero)
851 : {
852 : /* We know that there is always at least one iteration, so if the
853 : count is zero then it must have wrapped. Cope with this by
854 : subtracting 1 before the conversion and adding 1 to the result. */
855 0 : gcc_assert (TYPE_UNSIGNED (ni_actual_type));
856 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
857 : niters, build_minus_one_cst (ni_actual_type));
858 0 : niters = gimple_convert (&preheader_seq, compare_type, niters);
859 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
860 : niters, build_one_cst (compare_type));
861 : }
862 : else
863 0 : niters = gimple_convert (&preheader_seq, compare_type, niters);
864 :
865 : /* Iterate over all the rgroups and fill in their controls. We could use
866 : the first control from any rgroup for the loop condition; here we
867 : arbitrarily pick the last. */
868 0 : tree test_ctrl = NULL_TREE;
869 0 : tree iv_step = NULL_TREE;
870 0 : tree compare_step = NULL_TREE;
871 0 : rgroup_controls *rgc;
872 0 : rgroup_controls *iv_rgc = nullptr;
873 0 : unsigned int i;
874 0 : auto_vec<rgroup_controls> *controls = use_masks_p
875 0 : ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
876 : : &LOOP_VINFO_LENS (loop_vinfo);
877 0 : FOR_EACH_VEC_ELT (*controls, i, rgc)
878 0 : if (!rgc->controls.is_empty ())
879 : {
880 : /* First try using permutes. This adds a single vector
881 : instruction to the loop for each mask, but needs no extra
882 : loop invariants or IVs. */
883 0 : unsigned int nmasks = i + 1;
884 0 : if (use_masks_p && (nmasks & 1) == 0)
885 : {
886 0 : rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
887 0 : if (!half_rgc->controls.is_empty ()
888 0 : && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
889 0 : continue;
890 : }
891 :
892 0 : if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
893 0 : || !iv_rgc
894 0 : || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
895 0 : != rgc->max_nscalars_per_iter * rgc->factor))
896 : {
897 : /* See whether zero-based IV would ever generate all-false masks
898 : or zero length before wrapping around. */
899 0 : bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
900 :
901 : /* Set up all controls for this group. */
902 0 : test_ctrl
903 0 : = vect_set_loop_controls_directly (loop, loop_vinfo,
904 : &preheader_seq, &header_seq,
905 : loop_cond_gsi, rgc, niters,
906 : niters_skip, might_wrap_p,
907 : &iv_step, &compare_step);
908 :
909 0 : iv_rgc = rgc;
910 : }
911 :
912 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
913 0 : && rgc->controls.length () > 1)
914 : {
915 : /* vect_set_loop_controls_directly creates an IV whose step
916 : is equal to the expected sum of RGC->controls. Use that
917 : information to populate RGC->controls. */
918 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
919 0 : gcc_assert (iv_step);
920 0 : vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
921 : }
922 : }
923 :
924 : /* Emit all accumulated statements. */
925 0 : add_preheader_seq (loop, preheader_seq);
926 0 : add_header_seq (loop, header_seq);
927 :
928 : /* Get a boolean result that tells us whether to iterate. */
929 0 : gcond *cond_stmt;
930 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
931 0 : && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
932 : {
933 0 : gcc_assert (compare_step);
934 0 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
935 0 : cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
936 : NULL_TREE);
937 0 : }
938 : else
939 : {
940 0 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
941 0 : tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
942 0 : cond_stmt
943 0 : = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
944 : }
945 0 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
946 :
947 : /* The loop iterates (NITERS - 1) / VF + 1 times.
948 : Subtract one from this to get the latch count. */
949 0 : tree step = build_int_cst (compare_type,
950 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo));
951 0 : tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
952 : build_minus_one_cst (compare_type));
953 0 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
954 : niters_minus_one, step);
955 :
956 0 : if (final_iv)
957 : {
958 0 : gassign *assign;
959 : /* If vectorizing an inverted early break loop we have to restart the
960 : scalar loop at niters - vf. This matches what we do in
961 : vect_gen_vector_loop_niters_mult_vf for non-masked loops. */
962 0 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
963 : {
964 0 : tree ftype = TREE_TYPE (orig_niters);
965 0 : tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
966 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
967 : }
968 : else
969 0 : assign = gimple_build_assign (final_iv, orig_niters);
970 0 : gsi_insert_on_edge_immediate (exit_edge, assign);
971 : }
972 :
973 0 : return cond_stmt;
974 : }
975 :
976 : /* Set up the iteration condition and rgroup controls for LOOP in AVX512
977 : style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
978 : vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
979 : the number of iterations of the original scalar loop that should be
980 : handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
981 : for vect_set_loop_condition.
982 :
983 : Insert the branch-back condition before LOOP_COND_GSI and return the
984 : final gcond. */
985 :
986 : static gcond *
987 18 : vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
988 : edge exit_edge,
989 : loop_vec_info loop_vinfo, tree niters,
990 : tree final_iv,
991 : bool niters_maybe_zero,
992 : gimple_stmt_iterator loop_cond_gsi)
993 : {
994 18 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
995 18 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
996 18 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
997 18 : tree orig_niters = niters;
998 18 : gimple_seq preheader_seq = NULL;
999 :
1000 : /* Create an IV that counts down from niters and whose step
1001 : is the number of iterations processed in the current iteration.
1002 : Produce the controls with compares like the following.
1003 :
1004 : # iv_2 = PHI <niters, iv_3>
1005 : rem_4 = MIN <iv_2, VF>;
1006 : remv_6 = { rem_4, rem_4, rem_4, ... }
1007 : mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv6;
1008 : iv_3 = iv_2 - VF;
1009 : if (iv_2 > VF)
1010 : continue;
1011 :
1012 : Where the constant is built with elements at most VF - 1 and
1013 : repetitions according to max_nscalars_per_iter which is guarnateed
1014 : to be the same within a group. */
1015 :
1016 : /* Convert NITERS to the determined IV type. */
1017 18 : if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
1018 18 : && niters_maybe_zero)
1019 : {
1020 : /* We know that there is always at least one iteration, so if the
1021 : count is zero then it must have wrapped. Cope with this by
1022 : subtracting 1 before the conversion and adding 1 to the result. */
1023 0 : gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
1024 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters),
1025 0 : niters, build_minus_one_cst (TREE_TYPE (niters)));
1026 0 : niters = gimple_convert (&preheader_seq, iv_type, niters);
1027 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type,
1028 : niters, build_one_cst (iv_type));
1029 : }
1030 : else
1031 18 : niters = gimple_convert (&preheader_seq, iv_type, niters);
1032 :
1033 : /* Bias the initial value of the IV in case we need to skip iterations
1034 : at the beginning. */
1035 18 : tree niters_adj = niters;
1036 18 : if (niters_skip)
1037 : {
1038 1 : tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip);
1039 1 : niters_adj = gimple_build (&preheader_seq, PLUS_EXPR,
1040 : iv_type, niters, skip);
1041 : }
1042 :
1043 : /* The iteration step is the vectorization factor. */
1044 18 : tree iv_step = build_int_cst (iv_type, vf);
1045 :
1046 : /* Create the decrement IV. */
1047 18 : tree index_before_incr, index_after_incr;
1048 18 : gimple_stmt_iterator incr_gsi;
1049 18 : bool insert_after;
1050 18 : vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1051 18 : create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
1052 : &incr_gsi, insert_after, &index_before_incr,
1053 : &index_after_incr);
1054 :
1055 : /* Iterate over all the rgroups and fill in their controls. */
1056 80 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1057 : {
1058 26 : if (rgc.controls.is_empty ())
1059 6 : continue;
1060 :
1061 20 : tree ctrl_type = rgc.type;
1062 20 : poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
1063 :
1064 20 : tree vectype = rgc.compare_type;
1065 :
1066 : /* index_after_incr is the IV specifying the remaining iterations in
1067 : the next iteration. */
1068 20 : tree rem = index_after_incr;
1069 : /* When the data type for the compare to produce the mask is
1070 : smaller than the IV type we need to saturate. Saturate to
1071 : the smallest possible value (IV_TYPE) so we only have to
1072 : saturate once (CSE will catch redundant ones we add). */
1073 20 : if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
1074 9 : rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1075 : UNKNOWN_LOCATION,
1076 9 : MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
1077 20 : rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
1078 20 : UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
1079 :
1080 : /* Build a data vector composed of the remaining iterations. */
1081 20 : rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
1082 : UNKNOWN_LOCATION, vectype, rem);
1083 :
1084 : /* Provide a definition of each vector in the control group. */
1085 20 : tree next_ctrl = NULL_TREE;
1086 20 : tree first_rem = NULL_TREE;
1087 20 : tree ctrl;
1088 20 : unsigned int i;
1089 238 : FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
1090 : {
1091 : /* Previous controls will cover BIAS items. This control covers the
1092 : next batch. */
1093 86 : poly_uint64 bias = nitems_per_ctrl * i;
1094 :
1095 : /* Build the constant to compare the remaining iters against,
1096 : this is sth like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately
1097 : split into pieces. */
1098 86 : unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant ();
1099 86 : tree_vector_builder builder (vectype, n, 1);
1100 1266 : for (unsigned i = 0; i < n; ++i)
1101 : {
1102 1180 : unsigned HOST_WIDE_INT val
1103 1180 : = (i + bias.to_constant ()) / rgc.max_nscalars_per_iter;
1104 1180 : gcc_assert (val < vf.to_constant ());
1105 1180 : builder.quick_push (build_int_cst (TREE_TYPE (vectype), val));
1106 : }
1107 86 : tree cmp_series = builder.build ();
1108 :
1109 : /* Create the initial control. First include all items that
1110 : are within the loop limit. */
1111 86 : tree init_ctrl = NULL_TREE;
1112 86 : poly_uint64 const_limit;
1113 : /* See whether the first iteration of the vector loop is known
1114 : to have a full control. */
1115 86 : if (poly_int_tree_p (niters, &const_limit)
1116 86 : && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
1117 1 : init_ctrl = build_minus_one_cst (ctrl_type);
1118 : else
1119 : {
1120 : /* The remaining work items initially are niters. Saturate,
1121 : splat and compare. */
1122 85 : if (!first_rem)
1123 : {
1124 19 : first_rem = niters;
1125 19 : if (TYPE_PRECISION (TREE_TYPE (vectype))
1126 19 : < TYPE_PRECISION (iv_type))
1127 9 : first_rem = gimple_build (&preheader_seq,
1128 9 : MIN_EXPR, TREE_TYPE (first_rem),
1129 : first_rem, iv_step);
1130 19 : first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype),
1131 : first_rem);
1132 19 : first_rem = gimple_build_vector_from_val (&preheader_seq,
1133 : vectype, first_rem);
1134 : }
1135 85 : init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type,
1136 : cmp_series, first_rem);
1137 : }
1138 :
1139 : /* Now AND out the bits that are within the number of skipped
1140 : items. */
1141 86 : poly_uint64 const_skip;
1142 86 : if (niters_skip
1143 86 : && !(poly_int_tree_p (niters_skip, &const_skip)
1144 1 : && known_le (const_skip, bias)))
1145 : {
1146 : /* For integer mode masks it's cheaper to shift out the bits
1147 : since that avoids loading a constant. */
1148 1 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
1149 1 : init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1150 1 : lang_hooks.types.type_for_mode
1151 1 : (TYPE_MODE (ctrl_type), 1),
1152 : init_ctrl);
1153 : /* ??? But when the shift amount isn't constant this requires
1154 : a round-trip to GRPs. We could apply the bias to either
1155 : side of the compare instead. */
1156 2 : tree shift = gimple_build (&preheader_seq, MINUS_EXPR,
1157 1 : TREE_TYPE (niters_skip), niters_skip,
1158 1 : build_int_cst (TREE_TYPE (niters_skip),
1159 1 : bias));
1160 2 : shift = gimple_build (&preheader_seq, MULT_EXPR,
1161 1 : TREE_TYPE (niters_skip), shift,
1162 1 : build_int_cst (TREE_TYPE (niters_skip),
1163 1 : rgc.max_nscalars_per_iter));
1164 1 : init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR,
1165 1 : TREE_TYPE (init_ctrl),
1166 : init_ctrl, shift);
1167 1 : init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1168 : ctrl_type, init_ctrl);
1169 : }
1170 :
1171 : /* Get the control value for the next iteration of the loop. */
1172 86 : next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1173 : UNKNOWN_LOCATION,
1174 : LT_EXPR, ctrl_type, cmp_series, rem);
1175 :
1176 86 : vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
1177 86 : }
1178 : }
1179 :
1180 : /* Emit all accumulated statements. */
1181 18 : add_preheader_seq (loop, preheader_seq);
1182 :
1183 : /* Adjust the exit test using the decrementing IV. */
1184 18 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
1185 : /* When we peel for alignment with niter_skip != 0 this can
1186 : cause niter + niter_skip to wrap and since we are comparing the
1187 : value before the decrement here we get a false early exit.
1188 : We can't compare the value after decrement either because that
1189 : decrement could wrap as well as we're not doing a saturating
1190 : decrement. To avoid this situation we force a larger
1191 : iv_type. */
1192 18 : gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
1193 : NULL_TREE, NULL_TREE);
1194 18 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1195 :
1196 : /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
1197 : Subtract one from this to get the latch count. */
1198 18 : tree niters_minus_one
1199 18 : = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
1200 : build_minus_one_cst (TREE_TYPE (orig_niters)));
1201 18 : tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
1202 18 : if (niters_skip)
1203 1 : niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one,
1204 : fold_convert (iv_type, niters_skip));
1205 18 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
1206 : niters_adj2, iv_step);
1207 :
1208 18 : if (final_iv)
1209 : {
1210 0 : gassign *assign;
1211 : /* If vectorizing an inverted early break loop we have to restart the
1212 : scalar loop at niters - vf. This matches what we do in
1213 : vect_gen_vector_loop_niters_mult_vf for non-masked loops. */
1214 0 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
1215 : {
1216 0 : tree ftype = TREE_TYPE (orig_niters);
1217 0 : tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1218 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
1219 : }
1220 : else
1221 0 : assign = gimple_build_assign (final_iv, orig_niters);
1222 0 : gsi_insert_on_edge_immediate (exit_edge, assign);
1223 : }
1224 :
1225 18 : return cond_stmt;
1226 : }
1227 :
1228 :
1229 : /* Like vect_set_loop_condition, but handle the case in which the vector
1230 : loop handles exactly VF scalars per iteration. */
1231 :
1232 : static gcond *
1233 61933 : vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
1234 : class loop *loop, tree niters, tree step,
1235 : tree final_iv, bool niters_maybe_zero,
1236 : gimple_stmt_iterator loop_cond_gsi)
1237 : {
1238 61933 : tree indx_before_incr, indx_after_incr;
1239 61933 : gcond *cond_stmt;
1240 61933 : gcond *orig_cond;
1241 61933 : edge pe = loop_preheader_edge (loop);
1242 61933 : gimple_stmt_iterator incr_gsi;
1243 61933 : bool insert_after;
1244 61933 : enum tree_code code;
1245 61933 : tree niters_type = TREE_TYPE (niters);
1246 :
1247 61933 : orig_cond = get_loop_exit_condition (exit_edge);
1248 61933 : gcc_assert (orig_cond);
1249 61933 : loop_cond_gsi = gsi_for_stmt (orig_cond);
1250 :
1251 61933 : tree init, limit;
1252 61933 : if (!niters_maybe_zero && integer_onep (step))
1253 : {
1254 : /* In this case we can use a simple 0-based IV:
1255 :
1256 : A:
1257 : x = 0;
1258 : do
1259 : {
1260 : ...
1261 : x += 1;
1262 : }
1263 : while (x < NITERS); */
1264 61933 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1265 61933 : init = build_zero_cst (niters_type);
1266 61933 : limit = niters;
1267 : }
1268 : else
1269 : {
1270 : /* The following works for all values of NITERS except 0:
1271 :
1272 : B:
1273 : x = 0;
1274 : do
1275 : {
1276 : ...
1277 : x += STEP;
1278 : }
1279 : while (x <= NITERS - STEP);
1280 :
1281 : so that the loop continues to iterate if x + STEP - 1 < NITERS
1282 : but stops if x + STEP - 1 >= NITERS.
1283 :
1284 : However, if NITERS is zero, x never hits a value above NITERS - STEP
1285 : before wrapping around. There are two obvious ways of dealing with
1286 : this:
1287 :
1288 : - start at STEP - 1 and compare x before incrementing it
1289 : - start at -1 and compare x after incrementing it
1290 :
1291 : The latter is simpler and is what we use. The loop in this case
1292 : looks like:
1293 :
1294 : C:
1295 : x = -1;
1296 : do
1297 : {
1298 : ...
1299 : x += STEP;
1300 : }
1301 : while (x < NITERS - STEP);
1302 :
1303 : In both cases the loop limit is NITERS - STEP. */
1304 0 : gimple_seq seq = NULL;
1305 0 : limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1306 0 : limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
1307 0 : if (seq)
1308 : {
1309 0 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1310 0 : gcc_assert (!new_bb);
1311 : }
1312 0 : if (niters_maybe_zero)
1313 : {
1314 : /* Case C. */
1315 0 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1316 0 : init = build_all_ones_cst (niters_type);
1317 : }
1318 : else
1319 : {
1320 : /* Case B. */
1321 0 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1322 0 : init = build_zero_cst (niters_type);
1323 : }
1324 : }
1325 :
1326 61933 : vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1327 61933 : create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1328 : &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1329 61933 : indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1330 : true, NULL_TREE, true,
1331 : GSI_SAME_STMT);
1332 61933 : limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1333 : true, GSI_SAME_STMT);
1334 :
1335 61933 : cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1336 : NULL_TREE);
1337 :
1338 61933 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1339 :
1340 : /* Record the number of latch iterations. */
1341 61933 : if (limit == niters)
1342 : /* Case A: the loop iterates NITERS times. Subtract one to get the
1343 : latch count. */
1344 61933 : loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1345 : build_int_cst (niters_type, 1));
1346 : else
1347 : /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1348 : Subtract one from this to get the latch count. */
1349 0 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1350 : limit, step);
1351 :
1352 61933 : if (final_iv)
1353 : {
1354 0 : gassign *assign;
1355 0 : gcc_assert (single_pred_p (exit_edge->dest));
1356 0 : tree phi_dest
1357 0 : = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
1358 : /* Make sure to maintain LC SSA form here and elide the subtraction
1359 : if the value is zero. */
1360 0 : gphi *phi = create_phi_node (phi_dest, exit_edge->dest);
1361 0 : add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION);
1362 0 : if (!integer_zerop (init))
1363 : {
1364 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR,
1365 : phi_dest, init);
1366 0 : gimple_stmt_iterator gsi = gsi_after_labels (exit_edge->dest);
1367 0 : gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1368 : }
1369 : }
1370 :
1371 61933 : return cond_stmt;
1372 : }
1373 :
1374 : /* If we're using fully-masked loops, make LOOP iterate:
1375 :
1376 : N == (NITERS - 1) / STEP + 1
1377 :
1378 : times. When NITERS is zero, this is equivalent to making the loop
1379 : execute (1 << M) / STEP times, where M is the precision of NITERS.
1380 : NITERS_MAYBE_ZERO is true if this last case might occur.
1381 :
1382 : If we're not using fully-masked loops, make LOOP iterate:
1383 :
1384 : N == (NITERS - STEP) / STEP + 1
1385 :
1386 : times, where NITERS is known to be outside the range [1, STEP - 1].
1387 : This is equivalent to making the loop execute NITERS / STEP times
1388 : when NITERS is nonzero and (1 << M) / STEP times otherwise.
1389 : NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1390 :
1391 : If FINAL_IV is nonnull, it is an SSA name that should be set to
1392 : N * STEP on exit from the loop.
1393 :
1394 : Assumption: the exit-condition of LOOP is the last stmt in the loop. */
1395 :
1396 : void
1397 61951 : vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo,
1398 : tree niters, tree step, tree final_iv,
1399 : bool niters_maybe_zero)
1400 : {
1401 61951 : gcond *cond_stmt;
1402 61951 : gcond *orig_cond = get_loop_exit_condition (loop_e);
1403 61951 : gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1404 :
1405 61951 : if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1406 : {
1407 18 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1408 18 : cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_e,
1409 : loop_vinfo,
1410 : niters, final_iv,
1411 : niters_maybe_zero,
1412 : loop_cond_gsi);
1413 : else
1414 0 : cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_e,
1415 : loop_vinfo,
1416 : niters, final_iv,
1417 : niters_maybe_zero,
1418 : loop_cond_gsi);
1419 : }
1420 : else
1421 61933 : cond_stmt = vect_set_loop_condition_normal (loop_vinfo, loop_e, loop,
1422 : niters,
1423 : step, final_iv,
1424 : niters_maybe_zero,
1425 : loop_cond_gsi);
1426 :
1427 : /* Remove old loop exit test. */
1428 61951 : stmt_vec_info orig_cond_info;
1429 61951 : if (loop_vinfo
1430 61951 : && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1431 61523 : loop_vinfo->remove_stmt (orig_cond_info);
1432 : else
1433 428 : gsi_remove (&loop_cond_gsi, true);
1434 :
1435 61951 : if (dump_enabled_p ())
1436 11222 : dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1437 : (gimple *) cond_stmt);
1438 61951 : }
1439 :
1440 : /* Get the virtual operand live on E. The precondition on this is valid
1441 : immediate dominators and an actual virtual definition dominating E. */
1442 : /* ??? Costly band-aid. For the use in question we can populate a
1443 : live-on-exit/end-of-BB virtual operand when copying stmts. */
1444 :
1445 : static tree
1446 10 : get_live_virtual_operand_on_edge (edge e)
1447 : {
1448 10 : basic_block bb = e->src;
1449 22 : do
1450 : {
1451 61 : for (auto gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1452 : {
1453 37 : gimple *stmt = gsi_stmt (gsi);
1454 55 : if (gimple_vdef (stmt))
1455 10 : return gimple_vdef (stmt);
1456 39 : if (gimple_vuse (stmt))
1457 : return gimple_vuse (stmt);
1458 : }
1459 8 : if (gphi *vphi = get_virtual_phi (bb))
1460 2 : return gimple_phi_result (vphi);
1461 6 : bb = get_immediate_dominator (CDI_DOMINATORS, bb);
1462 6 : }
1463 : while (1);
1464 : }
1465 :
1466 : /* Given LOOP this function generates a new copy of it and puts it
1467 : on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1468 : non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1469 : basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1470 : entry or exit of LOOP. If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
1471 : continuation. This is correct for cases where one loop continues from the
1472 : other like in the vectorizer, but not true for uses in e.g. loop distribution
1473 : where the contents of the loop body are split but the iteration space of both
1474 : copies remains the same.
1475 :
1476 : If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
1477 : dominators were updated during the peeling. When doing early break vectorization
1478 : then LOOP_VINFO needs to be provided and is used to keep track of any newly created
1479 : memory references that need to be updated should we decide to vectorize. */
1480 :
1481 : class loop *
1482 34334 : slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
1483 : class loop *scalar_loop,
1484 : edge scalar_exit, edge e, edge *new_e,
1485 : bool flow_loops,
1486 : vec<basic_block> *updated_doms,
1487 : bool uncounted_p, bool create_main_e,
1488 : bool redirect_exits)
1489 : {
1490 34334 : class loop *new_loop;
1491 34334 : basic_block *new_bbs, *bbs, *pbbs;
1492 34334 : bool at_exit;
1493 34334 : bool was_imm_dom;
1494 34334 : basic_block exit_dest;
1495 34334 : edge exit, new_exit;
1496 34334 : bool duplicate_outer_loop = false;
1497 :
1498 34334 : exit = loop_exit;
1499 34334 : at_exit = (e == exit);
1500 34334 : if (!at_exit && e != loop_preheader_edge (loop))
1501 : return NULL;
1502 :
1503 34334 : if (scalar_loop == NULL)
1504 : {
1505 31805 : scalar_loop = loop;
1506 31805 : scalar_exit = loop_exit;
1507 : }
1508 2529 : else if (scalar_loop == loop)
1509 0 : scalar_exit = loop_exit;
1510 : else
1511 : {
1512 : /* Loop has been version, match exits up using the aux index. */
1513 7587 : for (edge exit : get_loop_exit_edges (scalar_loop))
1514 2529 : if (exit->aux == loop_exit->aux)
1515 : {
1516 2529 : scalar_exit = exit;
1517 2529 : break;
1518 2529 : }
1519 :
1520 2529 : gcc_assert (scalar_exit);
1521 : }
1522 :
1523 34334 : bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1524 34334 : pbbs = bbs + 1;
1525 34334 : get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1526 : /* Allow duplication of outer loops. */
1527 34334 : if (scalar_loop->inner)
1528 132 : duplicate_outer_loop = true;
1529 :
1530 : /* Generate new loop structure. */
1531 34334 : new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1532 34334 : duplicate_subloops (scalar_loop, new_loop);
1533 :
1534 34334 : exit_dest = exit->dest;
1535 34334 : was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1536 34334 : exit_dest) == exit->src ?
1537 : true : false);
1538 :
1539 : /* Also copy the pre-header, this avoids jumping through hoops to
1540 : duplicate the loop entry PHI arguments. Create an empty
1541 : pre-header unconditionally for this. */
1542 34334 : basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1543 34334 : edge entry_e = single_pred_edge (preheader);
1544 34334 : bbs[0] = preheader;
1545 34334 : new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1546 :
1547 34334 : copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1548 : &scalar_exit, 1, &new_exit, NULL,
1549 : at_exit ? loop->latch : e->src, true);
1550 34334 : exit = loop_exit;
1551 34334 : basic_block new_preheader = new_bbs[0];
1552 :
1553 34334 : gcc_assert (new_exit);
1554 :
1555 : /* Record the new loop exit information. new_loop doesn't have SCEV data and
1556 : so we must initialize the exit information. */
1557 34334 : if (new_e)
1558 33175 : *new_e = new_exit;
1559 :
1560 : /* Before installing PHI arguments make sure that the edges
1561 : into them match that of the scalar loop we analyzed. This
1562 : makes sure the SLP tree matches up between the main vectorized
1563 : loop and the epilogue vectorized copies. */
1564 34334 : if (single_succ_edge (preheader)->dest_idx
1565 34334 : != single_succ_edge (new_bbs[0])->dest_idx)
1566 : {
1567 29335 : basic_block swap_bb = new_bbs[1];
1568 29335 : gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1569 29335 : std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1570 29335 : EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1571 29335 : EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1572 : }
1573 34334 : if (duplicate_outer_loop)
1574 : {
1575 132 : class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1576 132 : if (loop_preheader_edge (scalar_loop)->dest_idx
1577 132 : != loop_preheader_edge (new_inner_loop)->dest_idx)
1578 : {
1579 99 : basic_block swap_bb = new_inner_loop->header;
1580 99 : gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1581 99 : std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1582 99 : EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1583 99 : EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1584 : }
1585 : }
1586 :
1587 34334 : add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1588 :
1589 : /* Skip new preheader since it's deleted if copy loop is added at entry. */
1590 178928 : for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1591 110260 : rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1592 :
1593 : /* Rename the exit uses. */
1594 104158 : for (edge exit : get_loop_exit_edges (new_loop))
1595 35490 : for (auto gsi = gsi_start_phis (exit->dest);
1596 83614 : !gsi_end_p (gsi); gsi_next (&gsi))
1597 : {
1598 48124 : tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
1599 48124 : rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
1600 48124 : if (MAY_HAVE_DEBUG_BIND_STMTS)
1601 22910 : adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
1602 34334 : }
1603 :
1604 34334 : auto loop_exits = get_loop_exit_edges (loop);
1605 34334 : bool has_multiple_exits_p = loop_exits.length () > 1;
1606 :
1607 : /* If REDIRECT_EXITS is false we leave the alternative exits untouched and
1608 : treat the duplication as if the loop only had the main exit. */
1609 34334 : bool redirect_multiple_exits_p = redirect_exits && has_multiple_exits_p;
1610 34334 : auto_vec<basic_block> doms;
1611 :
1612 34334 : if (at_exit) /* Add the loop copy at exit. */
1613 : {
1614 32747 : if (scalar_loop != loop && new_exit->dest != exit_dest)
1615 : {
1616 2525 : new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1617 2525 : flush_pending_stmts (new_exit);
1618 : }
1619 :
1620 32747 : bool need_virtual_phi = get_virtual_phi (loop->header);
1621 :
1622 : /* For the main loop exit preserve the LC PHI nodes. For vectorization
1623 : we need them to continue or finalize reductions. Since we do not
1624 : copy the loop exit blocks we have to materialize PHIs at the
1625 : new destination before redirecting edges. */
1626 32747 : for (auto gsi_from = gsi_start_phis (loop_exit->dest);
1627 78080 : !gsi_end_p (gsi_from); gsi_next (&gsi_from))
1628 : {
1629 45333 : tree res = gimple_phi_result (*gsi_from);
1630 45333 : create_phi_node (copy_ssa_name (res), new_preheader);
1631 : }
1632 32747 : edge e = redirect_edge_and_branch (loop_exit, new_preheader);
1633 32747 : gcc_assert (e == loop_exit);
1634 32747 : flush_pending_stmts (loop_exit);
1635 32747 : set_immediate_dominator (CDI_DOMINATORS, new_preheader, loop_exit->src);
1636 :
1637 : /* If we ended up choosing an exit leading to a path not using memory
1638 : we can end up without a virtual LC PHI. Create it when it is
1639 : needed because of the epilog loop continuation. */
1640 32747 : if (need_virtual_phi && !get_virtual_phi (loop_exit->dest))
1641 : {
1642 8 : tree header_def = gimple_phi_result (get_virtual_phi (loop->header));
1643 8 : gphi *vphi = create_phi_node (copy_ssa_name (header_def),
1644 : new_preheader);
1645 8 : add_phi_arg (vphi, get_live_virtual_operand_on_edge (loop_exit),
1646 : loop_exit, UNKNOWN_LOCATION);
1647 : }
1648 :
1649 32747 : basic_block main_loop_exit_block = new_preheader;
1650 32747 : basic_block alt_loop_exit_block = new_preheader;
1651 : /* When we redirect the other exits create the CFG
1652 : below to funnel everything through the merge block:
1653 : | loop_exit | alt1 | altN
1654 : v v ... v
1655 : main_loop_exit_block: alt_loop_exit_block:
1656 : | /
1657 : v v
1658 : new_preheader:
1659 : where in the new preheader we need merge PHIs for
1660 : the continuation values into the epilogue header.
1661 : Do not bother with exit PHIs for the early exits but
1662 : their live virtual operand. We'll fix up things below. */
1663 32747 : if (redirect_multiple_exits_p || uncounted_p)
1664 : {
1665 644 : edge loop_e = single_succ_edge (new_preheader);
1666 644 : new_preheader = split_edge (loop_e);
1667 :
1668 644 : if (redirect_exits)
1669 : {
1670 638 : gphi *vphi = NULL;
1671 638 : alt_loop_exit_block = new_preheader;
1672 3312 : for (auto exit : loop_exits)
1673 1398 : if (exit != loop_exit)
1674 : {
1675 760 : tree vphi_def = NULL_TREE;
1676 760 : if (gphi *evphi = get_virtual_phi (exit->dest))
1677 452 : vphi_def = gimple_phi_arg_def_from_edge (evphi, exit);
1678 760 : edge res
1679 760 : = redirect_edge_and_branch (exit, alt_loop_exit_block);
1680 760 : gcc_assert (res == exit);
1681 760 : redirect_edge_var_map_clear (exit);
1682 :
1683 760 : if (alt_loop_exit_block == new_preheader)
1684 615 : alt_loop_exit_block = split_edge (exit);
1685 760 : if (!need_virtual_phi)
1686 316 : continue;
1687 :
1688 : /* When the edge has no virtual LC PHI get at the live
1689 : virtual operand by other means. */
1690 444 : if (!vphi_def)
1691 2 : vphi_def = get_live_virtual_operand_on_edge (exit);
1692 :
1693 444 : if (!vphi)
1694 410 : vphi = create_phi_node (copy_ssa_name (vphi_def),
1695 : alt_loop_exit_block);
1696 : else
1697 : /* Edge redirection might re-allocate the PHI node
1698 : so we have to rediscover it. */
1699 34 : vphi = get_virtual_phi (alt_loop_exit_block);
1700 444 : add_phi_arg (vphi, vphi_def, exit, UNKNOWN_LOCATION);
1701 : }
1702 : }
1703 :
1704 644 : set_immediate_dominator (CDI_DOMINATORS, new_preheader,
1705 : loop->header);
1706 :
1707 : /* Fix up the profile counts of the new exit blocks.
1708 : main_loop_exit_block was created by duplicating the
1709 : preheader, so needs its count scaling according to the main
1710 : exit edge's probability. The remaining count from the
1711 : preheader goes to the alt_loop_exit_block, since all
1712 : alternative exits have been redirected there. */
1713 644 : main_loop_exit_block->count = loop_exit->count ();
1714 644 : alt_loop_exit_block->count
1715 644 : = preheader->count - main_loop_exit_block->count;
1716 : }
1717 :
1718 : /* Adjust the epilog loop PHI entry values to continue iteration.
1719 : This adds remaining necessary LC PHI nodes to the main exit
1720 : and creates merge PHIs when we have multiple exits with
1721 : their appropriate continuation. */
1722 32747 : if (flow_loops)
1723 : {
1724 32747 : edge loop_entry = single_succ_edge (new_preheader);
1725 32747 : bool peeled_iters = (uncounted_p
1726 32747 : || single_pred (loop->latch) != loop_exit->src);
1727 :
1728 : /* Record the new SSA names in the cache so that we can skip
1729 : materializing them again when we fill in the rest of the LC SSA
1730 : variables. */
1731 32747 : hash_map <tree, tree> new_phi_args;
1732 32747 : for (auto psi = gsi_start_phis (main_loop_exit_block);
1733 78088 : !gsi_end_p (psi); gsi_next (&psi))
1734 : {
1735 45341 : gphi *phi = *psi;
1736 45341 : tree new_arg = gimple_phi_arg_def_from_edge (phi, loop_exit);
1737 45341 : if (TREE_CODE (new_arg) != SSA_NAME)
1738 315 : continue;
1739 :
1740 : /* If the loop doesn't have a virtual def then only possibly keep
1741 : the epilog LC PHI for it and avoid creating new defs. */
1742 45116 : if (virtual_operand_p (new_arg) && !need_virtual_phi)
1743 : {
1744 90 : auto gsi = gsi_for_stmt (phi);
1745 90 : remove_phi_node (&gsi, true);
1746 90 : continue;
1747 90 : }
1748 :
1749 : /* If we decided not to remove the PHI node we should also not
1750 : rematerialize it later on. */
1751 45026 : new_phi_args.put (new_arg, gimple_phi_result (phi));
1752 : }
1753 :
1754 : /* Create the merge PHI nodes in new_preheader and populate the
1755 : arguments for the exits. */
1756 32747 : if (redirect_multiple_exits_p)
1757 : {
1758 615 : for (auto gsi_from = gsi_start_phis (loop->header),
1759 615 : gsi_to = gsi_start_phis (new_loop->header);
1760 2236 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1761 1621 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1762 : {
1763 1621 : gimple *from_phi = gsi_stmt (gsi_from);
1764 1621 : gimple *to_phi = gsi_stmt (gsi_to);
1765 :
1766 : /* When the vector loop is peeled then we need to use the
1767 : value at start of the loop, otherwise the main loop exit
1768 : should use the final iter value. */
1769 1621 : tree new_arg;
1770 1621 : if (peeled_iters)
1771 71 : new_arg = gimple_phi_result (from_phi);
1772 : else
1773 1550 : new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1774 : loop_latch_edge (loop));
1775 :
1776 : /* Check if we've already created a new phi node during edge
1777 : redirection and re-use it if so. Otherwise create a
1778 : LC PHI node to feed the merge PHI. */
1779 1621 : tree *res;
1780 3242 : if (virtual_operand_p (new_arg))
1781 : {
1782 : /* Use the existing virtual LC SSA from exit block. */
1783 410 : gphi *vphi = get_virtual_phi (main_loop_exit_block);
1784 410 : new_arg = gimple_phi_result (vphi);
1785 : }
1786 1211 : else if ((res = new_phi_args.get (new_arg)))
1787 105 : new_arg = *res;
1788 : else
1789 : {
1790 : /* Create the LC PHI node for the exit. */
1791 1106 : tree new_def = copy_ssa_name (new_arg);
1792 1106 : gphi *lc_phi
1793 1106 : = create_phi_node (new_def, main_loop_exit_block);
1794 1106 : SET_PHI_ARG_DEF (lc_phi, 0, new_arg);
1795 1106 : new_arg = new_def;
1796 : }
1797 :
1798 : /* Create the PHI node in the merge block merging the
1799 : main and early exit values. */
1800 1621 : tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1801 1621 : gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1802 1621 : edge main_e = single_succ_edge (main_loop_exit_block);
1803 1621 : SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, main_e, new_arg);
1804 :
1805 : /* And adjust the epilog entry value. */
1806 1621 : adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1807 : }
1808 : }
1809 :
1810 32747 : if (redirect_multiple_exits_p)
1811 : {
1812 : /* After creating the merge PHIs handle the early exits those
1813 : should use the values at the start of the loop. */
1814 615 : for (auto gsi_from = gsi_start_phis (loop->header),
1815 615 : gsi_to = gsi_start_phis (new_preheader);
1816 2236 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1817 1621 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1818 : {
1819 1621 : gimple *from_phi = gsi_stmt (gsi_from);
1820 1621 : gimple *to_phi = gsi_stmt (gsi_to);
1821 :
1822 : /* Now update the virtual PHI nodes with the right value. */
1823 1621 : tree alt_arg = gimple_phi_result (from_phi);
1824 3242 : if (virtual_operand_p (alt_arg))
1825 : {
1826 410 : gphi *vphi = get_virtual_phi (alt_loop_exit_block);
1827 410 : alt_arg = gimple_phi_result (vphi);
1828 : }
1829 : /* For other live args we didn't create LC PHI nodes.
1830 : Do so here. */
1831 : else
1832 : {
1833 1211 : tree alt_def = copy_ssa_name (alt_arg);
1834 1211 : gphi *lc_phi
1835 1211 : = create_phi_node (alt_def, alt_loop_exit_block);
1836 2711 : for (unsigned i = 0; i < gimple_phi_num_args (lc_phi);
1837 : ++i)
1838 1500 : SET_PHI_ARG_DEF (lc_phi, i, alt_arg);
1839 : alt_arg = alt_def;
1840 : }
1841 :
1842 : /* The merge PHIs live in NEW_PREHEADER; their
1843 : alternative argument always comes from the
1844 : successor edge of ALT_LOOP_EXIT_BLOCK. */
1845 1621 : edge alt_e = single_succ_edge (alt_loop_exit_block);
1846 1621 : SET_PHI_ARG_DEF_ON_EDGE (to_phi, alt_e, alt_arg);
1847 : }
1848 : }
1849 :
1850 : /* For the single exit case only create the missing LC PHI nodes
1851 : for the continuation of the loop IVs that are not also already
1852 : reductions and thus had LC PHI nodes on the exit already. When
1853 : we are not redirecting the alternative exits the layout is:
1854 :
1855 : loop_exit ---> new_preheader ---> epilog
1856 : alt_exit ---------------> original dest
1857 : */
1858 615 : if (!redirect_multiple_exits_p)
1859 : {
1860 32132 : for (auto gsi_from = gsi_start_phis (loop->header),
1861 32132 : gsi_to = gsi_start_phis (new_loop->header);
1862 122449 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1863 90317 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1864 : {
1865 90317 : gimple *from_phi = gsi_stmt (gsi_from);
1866 90317 : gimple *to_phi = gsi_stmt (gsi_to);
1867 90317 : tree new_arg;
1868 :
1869 : /* Use the value on the exiting path. When the exit is from
1870 : the latch edge we want the post-iteration value on that
1871 : edge; when the exit is from the loop header (before the
1872 : latch ever executes) we must use the current header value,
1873 : otherwise we pick up a name that was never defined. */
1874 90317 : if (!has_multiple_exits_p && !uncounted_p)
1875 90038 : new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1876 : loop_latch_edge (loop));
1877 : else
1878 279 : new_arg = gimple_phi_result (from_phi);
1879 :
1880 : /* Re-use the virtual LC PHI we already built when we are not
1881 : redirecting the other exits to avoid creating duplicate
1882 : virtual SSA names. */
1883 180634 : if (virtual_operand_p (new_arg))
1884 : {
1885 24454 : if (gphi *vphi = get_virtual_phi (main_loop_exit_block))
1886 : {
1887 24454 : adjust_phi_and_debug_stmts (to_phi, loop_entry,
1888 : gimple_phi_result (vphi));
1889 42328 : continue;
1890 : }
1891 : }
1892 :
1893 : /* Check if we've already created a new phi node during edge
1894 : redirection. If we have, only propagate the value
1895 : downwards. */
1896 65863 : if (tree *res = new_phi_args.get (new_arg))
1897 : {
1898 : /* Check if the new dest block already contains a use. */
1899 17874 : gimple *stmt = SSA_NAME_DEF_STMT (*res);
1900 :
1901 : /* If the value already exist, just update the destination
1902 : and if it doesn't we want a new node. */
1903 17874 : if (gimple_bb (stmt) == main_loop_exit_block)
1904 : {
1905 17874 : adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
1906 17874 : continue;
1907 : }
1908 : else
1909 0 : new_arg = *res;
1910 : }
1911 :
1912 47989 : tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1913 47989 : gphi *lcssa_phi = create_phi_node (new_res, main_loop_exit_block);
1914 47989 : SET_PHI_ARG_DEF (lcssa_phi, loop_exit->dest_idx, new_arg);
1915 47989 : adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1916 : }
1917 : }
1918 32747 : }
1919 :
1920 32747 : if (was_imm_dom || duplicate_outer_loop)
1921 32473 : set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1922 :
1923 : /* And remove the non-necessary forwarder again. Keep the other
1924 : one so we have a proper pre-header for the loop at the exit edge. */
1925 32747 : redirect_edge_pred (single_succ_edge (preheader),
1926 : single_pred (preheader));
1927 32747 : delete_basic_block (preheader);
1928 32747 : set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1929 32747 : loop_preheader_edge (scalar_loop)->src);
1930 :
1931 : /* Finally after wiring the new epilogue we need to update its main exit
1932 : to the original function exit we recorded. Other exits are already
1933 : correct. */
1934 32747 : if (has_multiple_exits_p || uncounted_p)
1935 : {
1936 825 : class loop *update_loop = new_loop;
1937 825 : doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
1938 19201 : for (unsigned i = 0; i < doms.length (); ++i)
1939 18376 : if (flow_bb_inside_loop_p (loop, doms[i]))
1940 2579 : doms.unordered_remove (i);
1941 :
1942 4241 : for (edge e : get_loop_exit_edges (update_loop))
1943 : {
1944 1766 : edge ex;
1945 1766 : edge_iterator ei;
1946 3573 : FOR_EACH_EDGE (ex, ei, e->dest->succs)
1947 : {
1948 : /* Find the first non-fallthrough block as fall-throughs can't
1949 : dominate other blocks. */
1950 1807 : if (single_succ_p (ex->dest))
1951 : {
1952 951 : doms.safe_push (ex->dest);
1953 951 : ex = single_succ_edge (ex->dest);
1954 : }
1955 1807 : doms.safe_push (ex->dest);
1956 : }
1957 1766 : doms.safe_push (e->dest);
1958 825 : }
1959 :
1960 825 : iterate_fix_dominators (CDI_DOMINATORS, doms, false);
1961 825 : if (updated_doms)
1962 825 : updated_doms->safe_splice (doms);
1963 : }
1964 : }
1965 : else /* Add the copy at entry. */
1966 : {
1967 : /* Copy the current loop LC PHI nodes between the original loop exit
1968 : block and the new loop header. This allows us to later split the
1969 : preheader block and still find the right LC nodes. */
1970 1587 : if (flow_loops)
1971 428 : for (auto gsi_from = gsi_start_phis (new_loop->header),
1972 428 : gsi_to = gsi_start_phis (loop->header);
1973 1345 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1974 917 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1975 : {
1976 917 : gimple *from_phi = gsi_stmt (gsi_from);
1977 917 : gimple *to_phi = gsi_stmt (gsi_to);
1978 917 : tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1979 : loop_latch_edge (new_loop));
1980 917 : adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
1981 : new_arg);
1982 : }
1983 :
1984 1587 : if (scalar_loop != loop)
1985 : {
1986 : /* Remove the non-necessary forwarder of scalar_loop again. */
1987 4 : redirect_edge_pred (single_succ_edge (preheader),
1988 : single_pred (preheader));
1989 4 : delete_basic_block (preheader);
1990 4 : set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1991 4 : loop_preheader_edge (scalar_loop)->src);
1992 4 : preheader = split_edge (loop_preheader_edge (loop));
1993 4 : entry_e = single_pred_edge (preheader);
1994 : }
1995 :
1996 1587 : redirect_edge_and_branch_force (entry_e, new_preheader);
1997 1587 : flush_pending_stmts (entry_e);
1998 1587 : set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1999 :
2000 :
2001 : /* `vect_set_loop_condition' replaces the condition in the main exit of
2002 : loop. For counted loops, this is the IV counting exit, so in the case
2003 : of the prolog loop, we are replacing the old IV counting exit limit of
2004 : total loop niters for the new limit of the prolog niters, as desired.
2005 : For uncounted loops, we don't have an IV-counting exit to replace, so
2006 : we add a dummy exit to be consumed by `vect_set_loop_condition' later
2007 : on. */
2008 1587 : if (create_main_e)
2009 : {
2010 31 : edge to_latch_e = single_pred_edge (new_loop->latch);
2011 31 : bool latch_is_false = to_latch_e->flags & EDGE_FALSE_VALUE ? true
2012 : : false;
2013 :
2014 : /* Add new bb for duplicate exit. */
2015 31 : basic_block bbcond = split_edge (to_latch_e);
2016 31 : gimple_stmt_iterator a = gsi_last_bb (bbcond);
2017 :
2018 : /* Fix flags for the edge leading to the latch. */
2019 31 : to_latch_e = find_edge (bbcond, new_loop->latch);
2020 31 : to_latch_e->flags &= ~EDGE_FALLTHRU;
2021 31 : to_latch_e->flags |= latch_is_false ? EDGE_FALSE_VALUE
2022 : : EDGE_TRUE_VALUE;
2023 :
2024 : /* Build the condition. */
2025 31 : tree cone = build_int_cst (sizetype, 1);
2026 31 : tree czero = build_int_cst (sizetype, 0);
2027 31 : gcond *cond_copy = gimple_build_cond (NE_EXPR, cone, czero, NULL_TREE,
2028 : NULL_TREE);
2029 :
2030 31 : gsi_insert_after (&a, cond_copy, GSI_NEW_STMT);
2031 :
2032 : /* Add edge for exiting the loop via new condition. */
2033 38 : edge dup_exit = make_edge (bbcond, new_exit->dest, latch_is_false
2034 : ? EDGE_TRUE_VALUE : EDGE_FALSE_VALUE);
2035 :
2036 31 : profile_probability probability = profile_probability::even ();
2037 31 : to_latch_e->probability = dup_exit->probability = probability;
2038 :
2039 31 : set_immediate_dominator (CDI_DOMINATORS, dup_exit->src,
2040 : new_exit->src);
2041 31 : new_exit = dup_exit;
2042 31 : *new_e = new_exit;
2043 : }
2044 :
2045 1587 : redirect_edge_and_branch_force (new_exit, preheader);
2046 1587 : flush_pending_stmts (new_exit);
2047 1587 : set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
2048 :
2049 : /* And remove the non-necessary forwarder again. Keep the other
2050 : one so we have a proper pre-header for the loop at the exit edge. */
2051 1587 : redirect_edge_pred (single_succ_edge (new_preheader),
2052 : single_pred (new_preheader));
2053 1587 : delete_basic_block (new_preheader);
2054 1587 : set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
2055 1587 : loop_preheader_edge (new_loop)->src);
2056 :
2057 : /* Update dominators for multiple exits. */
2058 1587 : if (has_multiple_exits_p || create_main_e)
2059 : {
2060 1103 : for (edge alt_e : loop_exits)
2061 : {
2062 437 : if ((alt_e == loop_exit) && !create_main_e)
2063 191 : continue;
2064 246 : basic_block old_dom
2065 246 : = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
2066 246 : if (flow_bb_inside_loop_p (loop, old_dom))
2067 : {
2068 101 : auto_vec<basic_block, 8> queue;
2069 101 : for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
2070 325 : son; son = next_dom_son (CDI_DOMINATORS, son))
2071 224 : if (!flow_bb_inside_loop_p (loop, son))
2072 123 : queue.safe_push (son);
2073 426 : for (auto son : queue)
2074 123 : set_immediate_dominator (CDI_DOMINATORS,
2075 : son, get_bb_copy (old_dom));
2076 101 : }
2077 : }
2078 : }
2079 :
2080 : /* When loop_exit != scalar_exit due to if-conversion loop versioning,
2081 : the `scalar_exit' now has two incoming edges, one from the if-converted
2082 : and one from the peeled prolog loop. It is therefore dominated by a
2083 : common block between these. Update its dominator accordingly. */
2084 222 : if (create_main_e && loop_exit != scalar_exit)
2085 0 : set_immediate_dominator (CDI_DOMINATORS, scalar_exit->dest,
2086 : recompute_dominator (CDI_DOMINATORS,
2087 : scalar_exit->dest));
2088 : }
2089 :
2090 34334 : free (new_bbs);
2091 34334 : free (bbs);
2092 :
2093 34334 : checking_verify_dominators (CDI_DOMINATORS);
2094 :
2095 34334 : return new_loop;
2096 34334 : }
2097 :
2098 :
2099 : /* Given the condition expression COND, put it as the last statement of
2100 : GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
2101 : DOM_BB; return the skip edge. GUARD_TO is the target basic block to
2102 : skip the loop. PROBABILITY is the skip edge's probability. Mark the
2103 : new edge as irreducible if IRREDUCIBLE_P is true. */
2104 :
2105 : static edge
2106 50416 : slpeel_add_loop_guard (basic_block guard_bb, tree cond,
2107 : basic_block guard_to, basic_block dom_bb,
2108 : profile_probability probability, bool irreducible_p)
2109 : {
2110 50416 : gimple_stmt_iterator gsi;
2111 50416 : edge new_e, enter_e;
2112 50416 : gcond *cond_stmt;
2113 50416 : gimple_seq gimplify_stmt_list = NULL;
2114 :
2115 50416 : enter_e = EDGE_SUCC (guard_bb, 0);
2116 50416 : enter_e->flags &= ~EDGE_FALLTHRU;
2117 50416 : enter_e->flags |= EDGE_FALSE_VALUE;
2118 50416 : gsi = gsi_last_bb (guard_bb);
2119 :
2120 50416 : cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
2121 : is_gimple_condexpr_for_cond, NULL_TREE);
2122 50416 : if (gimplify_stmt_list)
2123 22775 : gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
2124 :
2125 50416 : cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
2126 50416 : gsi = gsi_last_bb (guard_bb);
2127 50416 : gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
2128 :
2129 : /* Add new edge to connect guard block to the merge/loop-exit block. */
2130 50416 : new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
2131 :
2132 50416 : new_e->probability = probability;
2133 50416 : if (irreducible_p)
2134 16 : new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
2135 :
2136 50416 : enter_e->probability = probability.invert ();
2137 50416 : set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
2138 :
2139 : /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
2140 50416 : if (enter_e->dest->loop_father->header == enter_e->dest)
2141 474 : split_edge (enter_e);
2142 :
2143 50416 : return new_e;
2144 : }
2145 :
2146 :
2147 : /* This function verifies that the following restrictions apply to LOOP:
2148 : (1) it consists of exactly 2 basic blocks - header, and an empty latch
2149 : for innermost loop and 5 basic blocks for outer-loop.
2150 : (2) it is single entry, single exit
2151 : (3) its exit condition is the last stmt in the header
2152 : (4) E is the entry/exit edge of LOOP.
2153 : */
2154 :
2155 : bool
2156 496391 : slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e,
2157 : const_edge e)
2158 : {
2159 496391 : edge entry_e = loop_preheader_edge (loop);
2160 496391 : gcond *orig_cond = get_loop_exit_condition (exit_e);
2161 496391 : gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
2162 :
2163 : /* All loops have an outer scope; the only case loop->outer is NULL is for
2164 : the function itself. */
2165 496391 : if (!loop_outer (loop)
2166 496391 : || !empty_block_p (loop->latch)
2167 : || !exit_e
2168 : /* Verify that new loop exit condition can be trivially modified. */
2169 496391 : || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
2170 992782 : || (e != exit_e && e != entry_e))
2171 0 : return false;
2172 :
2173 496391 : basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
2174 496391 : get_loop_body_with_size (loop, bbs, loop->num_nodes);
2175 496391 : bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
2176 496391 : free (bbs);
2177 496391 : return ret;
2178 : }
2179 :
2180 : /* Function find_loop_location.
2181 :
2182 : Extract the location of the loop in the source code.
2183 : If the loop is not well formed for vectorization, an estimated
2184 : location is calculated.
2185 : Return the loop location if succeed and NULL if not. */
2186 :
2187 : dump_user_location_t
2188 3558046 : find_loop_location (class loop *loop)
2189 : {
2190 3558046 : gimple *stmt = NULL;
2191 3558046 : basic_block bb;
2192 3558046 : gimple_stmt_iterator si;
2193 :
2194 3558046 : if (!loop)
2195 0 : return dump_user_location_t ();
2196 :
2197 : /* For the root of the loop tree return the function location. */
2198 3558046 : if (!loop_outer (loop))
2199 0 : return dump_user_location_t::from_function_decl (cfun->decl);
2200 :
2201 3558046 : if (loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
2202 : {
2203 : /* We only care about the loop location, so use any exit with location
2204 : information. */
2205 11163019 : for (edge e : get_loop_exit_edges (loop))
2206 : {
2207 3653472 : stmt = get_loop_exit_condition (e);
2208 :
2209 3653472 : if (stmt
2210 3653472 : && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2211 3105207 : return stmt;
2212 3558046 : }
2213 : }
2214 :
2215 : /* If we got here the loop is probably not "well formed",
2216 : try to estimate the loop location */
2217 :
2218 452839 : if (!loop->header)
2219 0 : return dump_user_location_t ();
2220 :
2221 452839 : bb = loop->header;
2222 :
2223 1600246 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2224 : {
2225 1028292 : stmt = gsi_stmt (si);
2226 1028292 : if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2227 333724 : return stmt;
2228 : }
2229 :
2230 119115 : return dump_user_location_t ();
2231 : }
2232 :
2233 : /* Return true if the phi described by STMT_INFO defines an IV of the
2234 : loop to be vectorized. */
2235 :
2236 : static bool
2237 1330814 : iv_phi_p (stmt_vec_info stmt_info)
2238 : {
2239 1330814 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
2240 2661628 : if (virtual_operand_p (PHI_RESULT (phi)))
2241 : return false;
2242 :
2243 1053811 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2244 1053811 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2245 156698 : return false;
2246 :
2247 : return true;
2248 : }
2249 :
2250 : /* Return true if vectorizer can peel for nonlinear iv. */
2251 : static bool
2252 7772 : vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
2253 : stmt_vec_info stmt_info)
2254 : {
2255 7772 : enum vect_induction_op_type induction_type
2256 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
2257 7772 : tree niters_skip;
2258 : /* Init_expr will be update by vect_update_ivs_after_vectorizer,
2259 : if niters or vf is unkown:
2260 : For shift, when shift mount >= precision, there would be UD.
2261 : For mult, don't known how to generate
2262 : init_expr * pow (step, niters) for variable niters.
2263 : For neg unknown niters are ok, since niters of vectorized main loop
2264 : will always be multiple of 2.
2265 : See also PR113163, PR114196 and PR114485. */
2266 7772 : if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
2267 7772 : || LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2268 7772 : || (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2269 3102 : && induction_type != vect_step_op_neg))
2270 : {
2271 2918 : if (dump_enabled_p ())
2272 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273 : "Peeling for epilogue is not supported"
2274 : " for this nonlinear induction"
2275 : " when iteration count is unknown or"
2276 : " when using partial vectorization.\n");
2277 2918 : return false;
2278 : }
2279 :
2280 4854 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
2281 266 : && induction_type == vect_step_op_mul)
2282 : {
2283 24 : if (dump_enabled_p ())
2284 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 : "Peeling for is not supported for nonlinear mult"
2286 : " induction using partial vectorization.\n");
2287 24 : return false;
2288 : }
2289 :
2290 : /* Avoid compile time hog on vect_peel_nonlinear_iv_init. */
2291 4588 : if (induction_type == vect_step_op_mul)
2292 : {
2293 409 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
2294 409 : tree type = TREE_TYPE (step_expr);
2295 :
2296 812 : if (wi::exact_log2 (wi::to_wide (step_expr)) == -1
2297 409 : && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type))
2298 : {
2299 6 : if (dump_enabled_p ())
2300 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 : "Avoid compile time hog on"
2302 : " vect_peel_nonlinear_iv_init"
2303 : " for nonlinear induction vec_step_op_mul"
2304 : " when iteration count is too big.\n");
2305 6 : return false;
2306 : }
2307 : }
2308 :
2309 : /* Also doens't support peel for neg when niter is variable.
2310 : ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
2311 4824 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
2312 4824 : if ((niters_skip != NULL_TREE
2313 0 : && (TREE_CODE (niters_skip) != INTEGER_CST
2314 0 : || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0))
2315 4824 : || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
2316 4824 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
2317 : {
2318 4 : if (dump_enabled_p ())
2319 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 : "Peeling for alignement is not supported"
2321 : " for nonlinear induction when niters_skip"
2322 : " is not constant.\n");
2323 4 : return false;
2324 : }
2325 :
2326 : return true;
2327 : }
2328 :
2329 : /* Function vect_can_advance_ivs_p
2330 :
2331 : In case the number of iterations that LOOP iterates is unknown at compile
2332 : time, an epilog loop will be generated, and the loop induction variables
2333 : (IVs) will be "advanced" to the value they are supposed to take just before
2334 : the epilog loop. Here we check that the access function of the loop IVs
2335 : and the expression that represents the loop bound are simple enough.
2336 : These restrictions will be relaxed in the future. */
2337 :
2338 : bool
2339 499885 : vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
2340 : {
2341 499885 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2342 499885 : basic_block bb = loop->header;
2343 499885 : gphi_iterator gsi;
2344 :
2345 : /* Analyze phi functions of the loop header. */
2346 :
2347 499885 : if (dump_enabled_p ())
2348 25170 : dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
2349 1733175 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2350 : {
2351 1237255 : tree evolution_part;
2352 1237255 : enum vect_induction_op_type induction_type;
2353 :
2354 1237255 : gphi *phi = gsi.phi ();
2355 1237255 : stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2356 1237255 : if (dump_enabled_p ())
2357 72199 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
2358 : phi_info->stmt);
2359 :
2360 : /* Skip virtual phi's. The data dependences that are associated with
2361 : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
2362 :
2363 : Skip reduction phis. */
2364 1237255 : if (!iv_phi_p (phi_info))
2365 : {
2366 390875 : if (dump_enabled_p ())
2367 25649 : dump_printf_loc (MSG_NOTE, vect_location,
2368 : "reduc or virtual phi. skip.\n");
2369 390875 : continue;
2370 : }
2371 :
2372 846380 : induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2373 846380 : if (induction_type != vect_step_op_add)
2374 : {
2375 7772 : if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info))
2376 : return false;
2377 :
2378 4820 : continue;
2379 : }
2380 :
2381 : /* Analyze the evolution function. */
2382 :
2383 838608 : evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2384 838608 : if (evolution_part == NULL_TREE)
2385 : {
2386 981 : if (dump_enabled_p ())
2387 78 : dump_printf (MSG_MISSED_OPTIMIZATION,
2388 : "No access function or evolution.\n");
2389 981 : return false;
2390 : }
2391 :
2392 : /* FORNOW: We do not transform initial conditions of IVs
2393 : which evolution functions are not invariants in the loop. */
2394 :
2395 837627 : if (!expr_invariant_in_loop_p (loop, evolution_part))
2396 : {
2397 32 : if (dump_enabled_p ())
2398 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2399 : "evolution not invariant in loop.\n");
2400 32 : return false;
2401 : }
2402 :
2403 : /* FORNOW: We do not transform initial conditions of IVs
2404 : which evolution functions are a polynomial of degree >= 2. */
2405 :
2406 2070885 : if (tree_is_chrec (evolution_part))
2407 : {
2408 0 : if (dump_enabled_p ())
2409 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2410 : "evolution is chrec.\n");
2411 0 : return false;
2412 : }
2413 : }
2414 :
2415 : return true;
2416 : }
2417 :
2418 :
2419 : /* Function vect_update_ivs_after_vectorizer.
2420 :
2421 : "Advance" the induction variables of LOOP to the value they should take
2422 : after the execution of LOOP. This is currently necessary because the
2423 : vectorizer does not handle induction variables that are used after the
2424 : loop. Such a situation occurs when the last iterations of LOOP are
2425 : peeled, because:
2426 : 1. We introduced new uses after LOOP for IVs that were not originally used
2427 : after LOOP: the IVs of LOOP are now used by an epilog loop.
2428 : 2. LOOP is going to be vectorized; this means that it will iterate N/VF
2429 : times, whereas the loop IVs should be bumped N times.
2430 :
2431 : Input:
2432 : - LOOP - a loop that is going to be vectorized. The last few iterations
2433 : of LOOP were peeled.
2434 : - NITERS - the number of iterations that LOOP executes (before it is
2435 : vectorized). i.e, the number of times the ivs should be bumped.
2436 : - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
2437 : coming out from LOOP on which there are uses of the LOOP ivs
2438 : (this is the path from LOOP->exit to epilog_loop->preheader).
2439 :
2440 : The new definitions of the ivs are placed in LOOP->exit.
2441 : The phi args associated with the edge UPDATE_E in the bb
2442 : UPDATE_E->dest are updated accordingly.
2443 :
2444 : - EARLY_EXIT_P - Indicates whether the exit is an early exit rather than
2445 : the main latch exit.
2446 :
2447 : Assumption 1: Like the rest of the vectorizer, this function assumes
2448 : a single loop exit that has a single predecessor.
2449 :
2450 : Assumption 2: The phi nodes in the LOOP header and in update_bb are
2451 : organized in the same order.
2452 :
2453 : Assumption 3: The access function of the ivs is simple enough (see
2454 : vect_can_advance_ivs_p). This assumption will be relaxed in the future.
2455 :
2456 : Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
2457 : coming out of LOOP on which the ivs of LOOP are used (this is the path
2458 : that leads to the epilog loop; other paths skip the epilog loop). This
2459 : path starts with the edge UPDATE_E, and its destination (denoted update_bb)
2460 : needs to have its phis updated.
2461 : */
2462 :
2463 : static void
2464 33362 : vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
2465 : tree niters, edge update_e,
2466 : bool early_exit_p)
2467 : {
2468 33362 : gphi_iterator gsi, gsi1;
2469 33362 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2470 33362 : basic_block update_bb = update_e->dest;
2471 33362 : basic_block exit_bb = update_e->src;
2472 : /* Check to see if this is an empty loop pre-header block. If it exists
2473 : we need to use the edge from that block -> loop header for updates but
2474 : must use the original exit_bb to add any new adjustment because there
2475 : can be a skip_epilog edge bypassing the epilog and so the loop pre-header
2476 : too. */
2477 33437 : if (empty_block_p (update_bb) && single_succ_p (update_bb))
2478 : {
2479 75 : update_e = single_succ_edge (update_bb);
2480 75 : update_bb = update_e->dest;
2481 : }
2482 33362 : gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
2483 :
2484 33362 : for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
2485 126921 : !gsi_end_p (gsi) && !gsi_end_p (gsi1);
2486 93559 : gsi_next (&gsi), gsi_next (&gsi1))
2487 : {
2488 93559 : tree init_expr;
2489 93559 : tree step_expr, off;
2490 93559 : tree type;
2491 93559 : tree var, ni, ni_name;
2492 :
2493 93559 : gphi *phi = gsi.phi ();
2494 93559 : gphi *phi1 = gsi1.phi ();
2495 93559 : stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2496 93559 : if (dump_enabled_p ())
2497 12237 : dump_printf_loc (MSG_NOTE, vect_location,
2498 : "vect_update_ivs_after_vectorizer: phi: %G",
2499 : (gimple *) phi);
2500 :
2501 : /* Skip reduction and virtual phis. */
2502 93559 : if (!iv_phi_p (phi_info))
2503 : {
2504 42826 : if (dump_enabled_p ())
2505 4762 : dump_printf_loc (MSG_NOTE, vect_location,
2506 : "reduc or virtual phi. skip.\n");
2507 42826 : continue;
2508 : }
2509 :
2510 50733 : type = TREE_TYPE (gimple_phi_result (phi));
2511 50733 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2512 50733 : step_expr = unshare_expr (step_expr);
2513 :
2514 : /* FORNOW: We do not support IVs whose evolution function is a polynomial
2515 : of degree >= 2 or exponential. */
2516 50733 : gcc_assert (!tree_is_chrec (step_expr));
2517 :
2518 50733 : init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2519 50733 : gimple_seq stmts = NULL;
2520 50733 : enum vect_induction_op_type induction_type
2521 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2522 :
2523 50733 : if (induction_type == vect_step_op_add)
2524 : {
2525 50605 : tree stype = TREE_TYPE (step_expr);
2526 50605 : off = fold_build2 (MULT_EXPR, stype,
2527 : fold_convert (stype, niters), step_expr);
2528 :
2529 50605 : if (POINTER_TYPE_P (type))
2530 3552 : ni = fold_build_pointer_plus (init_expr, off);
2531 : else
2532 47053 : ni = fold_convert (type,
2533 : fold_build2 (PLUS_EXPR, stype,
2534 : fold_convert (stype, init_expr),
2535 : off));
2536 : }
2537 : /* Don't bother call vect_peel_nonlinear_iv_init. */
2538 128 : else if (induction_type == vect_step_op_neg)
2539 : ni = init_expr;
2540 : else
2541 84 : ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
2542 : niters, step_expr,
2543 : induction_type, early_exit_p);
2544 :
2545 50733 : var = create_tmp_var (type, "tmp");
2546 :
2547 50733 : gimple_seq new_stmts = NULL;
2548 50733 : ni_name = force_gimple_operand (ni, &new_stmts, false, var);
2549 :
2550 : /* Exit_bb shouldn't be empty, but we also can't insert after a ctrl
2551 : statements. */
2552 50733 : if (!gsi_end_p (last_gsi) && !is_ctrl_stmt (gsi_stmt (last_gsi)))
2553 : {
2554 248 : gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
2555 248 : gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
2556 : }
2557 : else
2558 : {
2559 50485 : gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
2560 50485 : gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
2561 : }
2562 :
2563 : /* Update the PHI argument on the requested edge. */
2564 50733 : adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
2565 : }
2566 33362 : }
2567 :
2568 : /* Return a gimple value containing the misalignment (measured in vector
2569 : elements) for the loop described by LOOP_VINFO, i.e. how many elements
2570 : it is away from a perfectly aligned address. Add any new statements
2571 : to SEQ. */
2572 :
2573 : static tree
2574 202 : get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
2575 : {
2576 202 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2577 202 : stmt_vec_info stmt_info = dr_info->stmt;
2578 202 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2579 :
2580 202 : poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2581 202 : unsigned HOST_WIDE_INT target_align_c;
2582 202 : tree target_align_minus_1;
2583 :
2584 202 : bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2585 202 : size_zero_node) < 0;
2586 202 : tree offset = (negative
2587 202 : ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
2588 : * TREE_INT_CST_LOW
2589 : (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
2590 199 : : size_zero_node);
2591 202 : tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
2592 : stmt_info, seq,
2593 : offset);
2594 202 : tree type = unsigned_type_for (TREE_TYPE (start_addr));
2595 202 : if (target_align.is_constant (&target_align_c))
2596 202 : target_align_minus_1 = build_int_cst (type, target_align_c - 1);
2597 : else
2598 : {
2599 : tree vla = build_int_cst (type, target_align);
2600 : target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla,
2601 : build_int_cst (type, 1));
2602 : }
2603 :
2604 202 : HOST_WIDE_INT elem_size
2605 202 : = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2606 404 : tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
2607 :
2608 : /* Create: misalign_in_bytes = addr & (target_align - 1). */
2609 202 : tree int_start_addr = fold_convert (type, start_addr);
2610 202 : tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
2611 : target_align_minus_1);
2612 :
2613 : /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
2614 202 : tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
2615 : elem_size_log);
2616 :
2617 202 : return misalign_in_elems;
2618 : }
2619 :
2620 : /* Function vect_gen_prolog_loop_niters
2621 :
2622 : Generate the number of iterations which should be peeled as prolog for the
2623 : loop represented by LOOP_VINFO. It is calculated as the misalignment of
2624 : DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2625 : As a result, after the execution of this loop, the data reference DR will
2626 : refer to an aligned location. The following computation is generated:
2627 :
2628 : If the misalignment of DR is known at compile time:
2629 : addr_mis = int mis = DR_MISALIGNMENT (dr);
2630 : Else, compute address misalignment in bytes:
2631 : addr_mis = addr & (target_align - 1)
2632 :
2633 : prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2634 :
2635 : (elem_size = element type size; an element is the scalar element whose type
2636 : is the inner type of the vectype)
2637 :
2638 : The computations will be emitted at the end of BB. We also compute and
2639 : store upper bound (included) of the result in BOUND.
2640 :
2641 : When the step of the data-ref in the loop is not 1 (as in interleaved data
2642 : and SLP), the number of iterations of the prolog must be divided by the step
2643 : (which is equal to the size of interleaved group).
2644 :
2645 : The above formulas assume that VF == number of elements in the vector. This
2646 : may not hold when there are multiple-types in the loop.
2647 : In this case, for some data-references in the loop the VF does not represent
2648 : the number of elements that fit in the vector. Therefore, instead of VF we
2649 : use TYPE_VECTOR_SUBPARTS. */
2650 :
2651 : static tree
2652 428 : vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
2653 : basic_block bb, poly_int64 *bound)
2654 : {
2655 428 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2656 428 : tree var;
2657 428 : tree niters_type
2658 428 : = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo) ? sizetype
2659 428 : : TREE_TYPE (LOOP_VINFO_NITERS
2660 : (loop_vinfo));
2661 428 : gimple_seq stmts = NULL, new_stmts = NULL;
2662 428 : tree iters, iters_name;
2663 428 : stmt_vec_info stmt_info = dr_info->stmt;
2664 428 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2665 428 : poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2666 :
2667 428 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2668 : {
2669 226 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2670 :
2671 226 : if (dump_enabled_p ())
2672 217 : dump_printf_loc (MSG_NOTE, vect_location,
2673 : "known peeling = %d.\n", npeel);
2674 :
2675 226 : iters = build_int_cst (niters_type, npeel);
2676 226 : *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2677 : }
2678 : else
2679 : {
2680 202 : tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
2681 202 : tree type = TREE_TYPE (misalign_in_elems);
2682 202 : HOST_WIDE_INT elem_size
2683 202 : = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2684 : /* We only do prolog peeling if the target alignment is known at compile
2685 : time. */
2686 202 : poly_uint64 align_in_elems =
2687 202 : exact_div (target_align, elem_size);
2688 202 : tree align_in_elems_minus_1 =
2689 202 : build_int_cst (type, align_in_elems - 1);
2690 202 : tree align_in_elems_tree = build_int_cst (type, align_in_elems);
2691 :
2692 : /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
2693 : & (align_in_elems - 1)). */
2694 202 : bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2695 202 : size_zero_node) < 0;
2696 202 : if (negative)
2697 3 : iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
2698 : align_in_elems_tree);
2699 : else
2700 199 : iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
2701 : misalign_in_elems);
2702 202 : iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
2703 202 : iters = fold_convert (niters_type, iters);
2704 202 : *bound = align_in_elems;
2705 : }
2706 :
2707 428 : if (dump_enabled_p ())
2708 276 : dump_printf_loc (MSG_NOTE, vect_location,
2709 : "niters for prolog loop: %T\n", iters);
2710 :
2711 428 : var = create_tmp_var (niters_type, "prolog_loop_niters");
2712 428 : iters_name = force_gimple_operand (iters, &new_stmts, false, var);
2713 :
2714 428 : if (new_stmts)
2715 202 : gimple_seq_add_seq (&stmts, new_stmts);
2716 428 : if (stmts)
2717 : {
2718 202 : gcc_assert (single_succ_p (bb));
2719 202 : gimple_stmt_iterator gsi = gsi_last_bb (bb);
2720 202 : if (gsi_end_p (gsi))
2721 42 : gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2722 : else
2723 160 : gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
2724 : }
2725 428 : return iters_name;
2726 : }
2727 :
2728 :
2729 : /* Function vect_update_init_of_dr
2730 :
2731 : If CODE is PLUS, the vector loop starts NITERS iterations after the
2732 : scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2733 : iterations before the scalar one (using masking to skip inactive
2734 : elements). This function updates the information recorded in DR to
2735 : account for the difference. Specifically, it updates the OFFSET
2736 : field of DR_INFO. */
2737 :
2738 : static void
2739 24323 : vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2740 : {
2741 24323 : struct data_reference *dr = dr_info->dr;
2742 24323 : tree offset = dr_info->offset;
2743 24323 : if (!offset)
2744 24323 : offset = build_zero_cst (sizetype);
2745 :
2746 24323 : niters = fold_build2 (MULT_EXPR, sizetype,
2747 : fold_convert (sizetype, niters),
2748 : fold_convert (sizetype, DR_STEP (dr)));
2749 24323 : offset = fold_build2 (code, sizetype,
2750 : fold_convert (sizetype, offset), niters);
2751 24323 : dr_info->offset = offset;
2752 24323 : }
2753 :
2754 :
2755 : /* Function vect_update_inits_of_drs
2756 :
2757 : Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
2758 : CODE and NITERS are as for vect_update_inits_of_dr. */
2759 :
2760 : void
2761 7252 : vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2762 : tree_code code)
2763 : {
2764 7252 : unsigned int i;
2765 7252 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2766 7252 : struct data_reference *dr;
2767 :
2768 7252 : DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2769 :
2770 : /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
2771 : here, but since we might use these niters to update the epilogues niters
2772 : and data references we can't insert them here as this definition might not
2773 : always dominate its uses. */
2774 7252 : if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2775 4570 : niters = fold_convert (sizetype, niters);
2776 :
2777 39131 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2778 : {
2779 24747 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2780 24747 : if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2781 24323 : && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2782 24323 : vect_update_init_of_dr (dr_info, niters, code);
2783 : }
2784 7252 : }
2785 :
2786 : /* For the information recorded in LOOP_VINFO prepare the loop for peeling
2787 : by masking. This involves calculating the number of iterations to
2788 : be peeled and then aligning all memory references appropriately. */
2789 :
2790 : void
2791 1 : vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2792 : {
2793 1 : tree misalign_in_elems;
2794 1 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2795 :
2796 1 : gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2797 :
2798 : /* From the information recorded in LOOP_VINFO get the number of iterations
2799 : that need to be skipped via masking. */
2800 1 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2801 : {
2802 1 : poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2803 1 : - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2804 1 : misalign_in_elems = build_int_cst (type, misalign);
2805 : }
2806 : else
2807 : {
2808 0 : gimple_seq seq1 = NULL, seq2 = NULL;
2809 0 : misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
2810 0 : misalign_in_elems = fold_convert (type, misalign_in_elems);
2811 0 : misalign_in_elems = force_gimple_operand (misalign_in_elems,
2812 : &seq2, true, NULL_TREE);
2813 0 : gimple_seq_add_seq (&seq1, seq2);
2814 0 : if (seq1)
2815 : {
2816 0 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2817 0 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2818 0 : gcc_assert (!new_bb);
2819 : }
2820 : }
2821 :
2822 1 : if (dump_enabled_p ())
2823 1 : dump_printf_loc (MSG_NOTE, vect_location,
2824 : "misalignment for fully-masked loop: %T\n",
2825 : misalign_in_elems);
2826 :
2827 1 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2828 :
2829 1 : vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
2830 1 : }
2831 :
2832 : /* This function builds ni_name = number of iterations. Statements
2833 : are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
2834 : it to TRUE if new ssa_var is generated. */
2835 :
2836 : tree
2837 61994 : vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2838 : {
2839 61994 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
2840 : return NULL_TREE;
2841 61920 : tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2842 61920 : if (TREE_CODE (ni) == INTEGER_CST)
2843 : return ni;
2844 : else
2845 : {
2846 25849 : tree ni_name, var;
2847 25849 : gimple_seq stmts = NULL;
2848 25849 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2849 :
2850 25849 : var = create_tmp_var (TREE_TYPE (ni), "niters");
2851 25849 : ni_name = force_gimple_operand (ni, &stmts, false, var);
2852 25849 : if (stmts)
2853 : {
2854 24790 : gsi_insert_seq_on_edge_immediate (pe, stmts);
2855 24790 : if (new_var_p != NULL)
2856 212 : *new_var_p = true;
2857 : }
2858 :
2859 25849 : return ni_name;
2860 : }
2861 : }
2862 :
2863 : /* Calculate the number of iterations above which vectorized loop will be
2864 : preferred than scalar loop. NITERS_PROLOG is the number of iterations
2865 : of prolog loop. If it's integer const, the integer number is also passed
2866 : in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
2867 : number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
2868 : value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
2869 : threshold below which the scalar (rather than vectorized) loop will be
2870 : executed. This function stores the upper bound (inclusive) of the result
2871 : in BOUND_SCALAR. */
2872 :
2873 : static tree
2874 24834 : vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2875 : poly_int64 bound_prolog, poly_int64 bound_epilog,
2876 : int th, poly_uint64 *bound_scalar,
2877 : bool check_profitability)
2878 : {
2879 24834 : tree type = TREE_TYPE (niters_prolog);
2880 24834 : tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2881 : build_int_cst (type, bound_epilog));
2882 :
2883 24834 : *bound_scalar = bound_prolog + bound_epilog;
2884 24834 : if (check_profitability)
2885 : {
2886 : /* TH indicates the minimum niters of vectorized loop, while we
2887 : compute the maximum niters of scalar loop. */
2888 15919 : th--;
2889 : /* Peeling for constant times. */
2890 15919 : if (int_niters_prolog >= 0)
2891 : {
2892 15881 : *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
2893 15881 : return build_int_cst (type, *bound_scalar);
2894 : }
2895 : /* Peeling an unknown number of times. Note that both BOUND_PROLOG
2896 : and BOUND_EPILOG are inclusive upper bounds. */
2897 38 : if (known_ge (th, bound_prolog + bound_epilog))
2898 : {
2899 0 : *bound_scalar = th;
2900 0 : return build_int_cst (type, th);
2901 : }
2902 : /* Need to do runtime comparison. */
2903 38 : else if (maybe_gt (th, bound_epilog))
2904 : {
2905 38 : *bound_scalar = upper_bound (*bound_scalar, th);
2906 38 : return fold_build2 (MAX_EXPR, type,
2907 : build_int_cst (type, th), niters);
2908 : }
2909 : }
2910 : return niters;
2911 : }
2912 :
2913 : /* NITERS is the number of times that the original scalar loop executes
2914 : after peeling. Work out the maximum number of iterations N that can
2915 : be handled by the vectorized form of the loop and then either:
2916 :
2917 : a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2918 :
2919 : niters_vector = N
2920 :
2921 : b) set *STEP_VECTOR_PTR to one and generate:
2922 :
2923 : niters_vector = N / vf
2924 :
2925 : In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2926 : any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
2927 : is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).
2928 :
2929 : Case (a) is used for LOOP_VINFO_USING_PARTIAL_VECTORS_P or if VF is
2930 : variable. As stated above, NITERS_VECTOR then equals the number
2931 : of scalar iterations and vect_set_loop_condition will handle the
2932 : step. As opposed to (b) we don't know anything about NITER_VECTOR's
2933 : range here.
2934 : */
2935 :
2936 : void
2937 33497 : vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2938 : tree *niters_vector_ptr, tree *step_vector_ptr,
2939 : bool niters_no_overflow)
2940 : {
2941 33497 : tree ni_minus_gap, var;
2942 33497 : tree niters_vector, step_vector;
2943 33497 : tree type = niters ? TREE_TYPE (niters) : sizetype;
2944 33497 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2945 33497 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2946 :
2947 : /* If epilogue loop is required because of data accesses with gaps, we
2948 : subtract one iteration from the total number of iterations here for
2949 : correct calculation of RATIO. */
2950 33497 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2951 : {
2952 372 : ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2953 : build_one_cst (type));
2954 372 : if (!is_gimple_val (ni_minus_gap))
2955 : {
2956 170 : var = create_tmp_var (type, "ni_gap");
2957 170 : gimple *stmts = NULL;
2958 170 : ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2959 : true, var);
2960 170 : gsi_insert_seq_on_edge_immediate (pe, stmts);
2961 : }
2962 : }
2963 : else
2964 : ni_minus_gap = niters;
2965 :
2966 : /* To silence some unexpected warnings, simply initialize to 0. */
2967 33497 : unsigned HOST_WIDE_INT const_vf = 0;
2968 33497 : if (vf.is_constant (&const_vf)
2969 33497 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2970 : {
2971 : /* Create: niters / vf, which is equivalent to niters >> log2(vf) when
2972 : vf is a power of two, and when not we approximate using a
2973 : truncating division. */
2974 : /* If it's known that niters == number of latch executions + 1 doesn't
2975 : overflow, we can generate niters / vf; otherwise we generate
2976 : (niters - vf) / vf + 1 by using the fact that we know ratio
2977 : will be at least one. */
2978 33479 : tree var_vf = build_int_cst (type, const_vf);
2979 33479 : if (niters_no_overflow)
2980 33258 : niters_vector = fold_build2 (TRUNC_DIV_EXPR, type, ni_minus_gap,
2981 : var_vf);
2982 : else
2983 221 : niters_vector
2984 221 : = fold_build2 (PLUS_EXPR, type,
2985 : fold_build2 (TRUNC_DIV_EXPR, type,
2986 : fold_build2 (MINUS_EXPR, type,
2987 : ni_minus_gap,
2988 : var_vf),
2989 : var_vf),
2990 : build_int_cst (type, 1));
2991 33479 : step_vector = build_one_cst (type);
2992 : }
2993 : else
2994 : {
2995 18 : niters_vector = ni_minus_gap;
2996 18 : step_vector = build_int_cst (type, vf);
2997 : }
2998 :
2999 33497 : if (!is_gimple_val (niters_vector))
3000 : {
3001 25629 : var = create_tmp_var (type, "bnd");
3002 25629 : gimple_seq stmts = NULL;
3003 25629 : niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
3004 25629 : gsi_insert_seq_on_edge_immediate (pe, stmts);
3005 : /* Peeling algorithm guarantees that vector loop bound is at least ONE,
3006 : we set range information to make niters analyzer's life easier.
3007 : Note the number of latch iteration value can be TYPE_MAX_VALUE so
3008 : we have to represent the vector niter TYPE_MAX_VALUE + 1 / vf. */
3009 25629 : if (stmts != NULL
3010 25629 : && integer_onep (step_vector))
3011 : {
3012 25614 : if (niters_no_overflow)
3013 : {
3014 25411 : int_range<1> vr (type,
3015 50822 : wi::one (TYPE_PRECISION (type)),
3016 50822 : wi::div_trunc (wi::max_value
3017 25411 : (TYPE_PRECISION (type),
3018 25411 : TYPE_SIGN (type)),
3019 : const_vf,
3020 50822 : TYPE_SIGN (type)));
3021 25411 : set_range_info (niters_vector, vr);
3022 25411 : }
3023 : /* For VF == 1 the vector IV might also overflow so we cannot
3024 : assert a minimum value of 1. */
3025 203 : else if (const_vf > 1)
3026 : {
3027 167 : int_range<1> vr (type,
3028 334 : wi::one (TYPE_PRECISION (type)),
3029 501 : wi::rshift (wi::max_value (TYPE_PRECISION (type),
3030 167 : TYPE_SIGN (type))
3031 334 : - (const_vf - 1),
3032 334 : exact_log2 (const_vf), TYPE_SIGN (type))
3033 668 : + 1);
3034 167 : set_range_info (niters_vector, vr);
3035 167 : }
3036 : }
3037 : }
3038 33497 : *niters_vector_ptr = niters_vector;
3039 33497 : *step_vector_ptr = step_vector;
3040 :
3041 33497 : return;
3042 : }
3043 :
3044 : /* Given NITERS_VECTOR which is the number of iterations for vectorized
3045 : loop specified by LOOP_VINFO after vectorization, compute the number
3046 : of iterations before vectorization (niters_vector * vf) and store it
3047 : to NITERS_VECTOR_MULT_VF_PTR. */
3048 :
3049 : static void
3050 32704 : vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
3051 : tree niters_vector,
3052 : tree *niters_vector_mult_vf_ptr)
3053 : {
3054 : /* We should be using a step_vector of VF if VF is variable. */
3055 32704 : int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
3056 32704 : tree type = TREE_TYPE (niters_vector);
3057 32704 : tree tree_vf = build_int_cst (type, vf);
3058 32704 : basic_block exit_bb = LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest;
3059 :
3060 32704 : gcc_assert (niters_vector_mult_vf_ptr != NULL);
3061 32704 : tree niters_vector_mult_vf = fold_build2 (MULT_EXPR, type,
3062 : niters_vector, tree_vf);
3063 :
3064 : /* If we've peeled a vector iteration then subtract one full vector
3065 : iteration. */
3066 32704 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3067 19 : niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
3068 : niters_vector_mult_vf, tree_vf);
3069 :
3070 32704 : if (!is_gimple_val (niters_vector_mult_vf))
3071 : {
3072 24859 : tree var = create_tmp_var (type, "niters_vector_mult_vf");
3073 24859 : gimple_seq stmts = NULL;
3074 24859 : niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
3075 : &stmts, true, var);
3076 24859 : gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
3077 24859 : gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
3078 : }
3079 32704 : *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
3080 32704 : }
3081 :
3082 : /* Function slpeel_add_loop_guard adds guard skipping from the beginning
3083 : of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
3084 : are two pred edges of the merge point before UPDATE_LOOP. The two loops
3085 : appear like below:
3086 :
3087 : guard_bb:
3088 : if (cond)
3089 : goto merge_bb;
3090 : else
3091 : goto skip_loop;
3092 :
3093 : skip_loop:
3094 : header_a:
3095 : i_1 = PHI<i_0, i_2>;
3096 : ...
3097 : i_2 = i_1 + 1;
3098 : if (cond_a)
3099 : goto latch_a;
3100 : else
3101 : goto exit_a;
3102 : latch_a:
3103 : goto header_a;
3104 :
3105 : exit_a:
3106 : i_5 = PHI<i_2>;
3107 :
3108 : merge_bb:
3109 : ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
3110 :
3111 : update_loop:
3112 : header_b:
3113 : i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
3114 : ...
3115 : i_4 = i_3 + 1;
3116 : if (cond_b)
3117 : goto latch_b;
3118 : else
3119 : goto exit_bb;
3120 : latch_b:
3121 : goto header_b;
3122 :
3123 : exit_bb:
3124 :
3125 : This function creates PHI nodes at merge_bb and replaces the use of i_5
3126 : in the update_loop's PHI node with the result of new PHI result. */
3127 :
3128 : static void
3129 25262 : slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
3130 : class loop *update_loop,
3131 : edge guard_edge, edge merge_edge)
3132 : {
3133 25262 : location_t merge_loc, guard_loc;
3134 25262 : edge orig_e = loop_preheader_edge (skip_loop);
3135 25262 : edge update_e = loop_preheader_edge (update_loop);
3136 25262 : gphi_iterator gsi_orig, gsi_update;
3137 :
3138 25262 : for ((gsi_orig = gsi_start_phis (skip_loop->header),
3139 25262 : gsi_update = gsi_start_phis (update_loop->header));
3140 92405 : !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
3141 67143 : gsi_next (&gsi_orig), gsi_next (&gsi_update))
3142 : {
3143 67143 : gphi *orig_phi = gsi_orig.phi ();
3144 67143 : gphi *update_phi = gsi_update.phi ();
3145 :
3146 : /* Generate new phi node at merge bb of the guard. */
3147 67143 : tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3148 67143 : gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
3149 :
3150 : /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
3151 : args in NEW_PHI for these edges. */
3152 67143 : tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
3153 67143 : tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
3154 67143 : merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
3155 67143 : guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
3156 67143 : add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
3157 67143 : add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
3158 :
3159 : /* Update phi in UPDATE_PHI. */
3160 67143 : adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
3161 : }
3162 25262 : }
3163 :
3164 : /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
3165 : Return a value that equals:
3166 :
3167 : - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
3168 : - SKIP_VALUE when the main loop is skipped. */
3169 :
3170 : tree
3171 3792 : vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
3172 : tree skip_value)
3173 : {
3174 3792 : gcc_assert (loop_vinfo->main_loop_edge);
3175 :
3176 3792 : tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
3177 3792 : basic_block bb = loop_vinfo->main_loop_edge->dest;
3178 3792 : gphi *new_phi = create_phi_node (phi_result, bb);
3179 3792 : add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
3180 : UNKNOWN_LOCATION);
3181 3792 : add_phi_arg (new_phi, skip_value,
3182 : loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
3183 3792 : return phi_result;
3184 : }
3185 :
3186 : /* Function vect_do_peeling.
3187 :
3188 : Input:
3189 : - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
3190 :
3191 : preheader:
3192 : LOOP:
3193 : header_bb:
3194 : loop_body
3195 : if (exit_loop_cond) goto exit_bb
3196 : else goto header_bb
3197 : exit_bb:
3198 :
3199 : - NITERS: The number of iterations of the loop.
3200 : - NITERSM1: The number of iterations of the loop's latch.
3201 : - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
3202 : - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
3203 : CHECK_PROFITABILITY is true.
3204 : Output:
3205 : - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
3206 : iterate after vectorization; see vect_set_loop_condition for details.
3207 : - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
3208 : should be set to the number of scalar iterations handled by the
3209 : vector loop. The SSA name is only used on exit from the loop.
3210 :
3211 : This function peels prolog and epilog from the loop, adds guards skipping
3212 : PROLOG and EPILOG for various conditions. As a result, the changed CFG
3213 : would look like:
3214 :
3215 : guard_bb_1:
3216 : if (prefer_scalar_loop) goto merge_bb_1
3217 : else goto guard_bb_2
3218 :
3219 : guard_bb_2:
3220 : if (skip_prolog) goto merge_bb_2
3221 : else goto prolog_preheader
3222 :
3223 : prolog_preheader:
3224 : PROLOG:
3225 : prolog_header_bb:
3226 : prolog_body
3227 : if (exit_prolog_cond) goto prolog_exit_bb
3228 : else goto prolog_header_bb
3229 : prolog_exit_bb:
3230 :
3231 : merge_bb_2:
3232 :
3233 : vector_preheader:
3234 : VECTOR LOOP:
3235 : vector_header_bb:
3236 : vector_body
3237 : if (exit_vector_cond) goto vector_exit_bb
3238 : else goto vector_header_bb
3239 : vector_exit_bb:
3240 :
3241 : guard_bb_3:
3242 : if (skip_epilog) goto merge_bb_3
3243 : else goto epilog_preheader
3244 :
3245 : merge_bb_1:
3246 :
3247 : epilog_preheader:
3248 : EPILOG:
3249 : epilog_header_bb:
3250 : epilog_body
3251 : if (exit_epilog_cond) goto merge_bb_3
3252 : else goto epilog_header_bb
3253 :
3254 : merge_bb_3:
3255 :
3256 : Note this function peels prolog and epilog only if it's necessary,
3257 : as well as guards.
3258 : This function returns the epilogue loop if a decision was made to vectorize
3259 : it, otherwise NULL.
3260 :
3261 : The analysis resulting in this epilogue loop's loop_vec_info was performed
3262 : in the same vect_analyze_loop call as the main loop's. At that time
3263 : vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
3264 : vectorization factors than the main loop. This list is chained in the
3265 : loop's loop_vec_info in the 'epilogue_vinfo' member. When we decide to
3266 : vectorize the epilogue loop for a lower vectorization factor, the
3267 : loop_vec_info in epilogue_vinfo is updated and linked to the epilogue loop.
3268 : This is later used to vectorize the epilogue.
3269 : The reason the loop_vec_info needs updating is that it was
3270 : constructed based on the original main loop, and the epilogue loop is a
3271 : copy of this loop, so all links pointing to statements in the original loop
3272 : need updating. Furthermore, these loop_vec_infos share the
3273 : data_reference's records, which will also need to be updated.
3274 :
3275 : TODO: Guard for prefer_scalar_loop should be emitted along with
3276 : versioning conditions if loop versioning is needed. */
3277 :
3278 :
3279 : class loop *
3280 61566 : vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
3281 : tree *niters_vector, tree *step_vector,
3282 : tree *niters_vector_mult_vf_var, int th,
3283 : bool check_profitability, bool niters_no_overflow,
3284 : tree *advance)
3285 : {
3286 61566 : edge e, guard_e;
3287 61566 : tree type = niters ? TREE_TYPE (niters) : sizetype;
3288 61566 : tree guard_cond;
3289 61566 : basic_block guard_bb, guard_to;
3290 61566 : profile_probability prob_prolog, prob_vector, prob_epilog;
3291 61566 : int estimated_vf;
3292 61566 : int prolog_peeling = 0;
3293 61566 : bool vect_epilogues = loop_vinfo->epilogue_vinfo != NULL;
3294 61566 : bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);
3295 :
3296 61566 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3297 61565 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3298 :
3299 61566 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3300 61566 : poly_uint64 bound_epilog = 0;
3301 61566 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
3302 61548 : && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3303 32385 : bound_epilog += vf - 1;
3304 61566 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3305 372 : bound_epilog += 1;
3306 :
3307 : /* For early breaks the scalar loop needs to execute at most VF times
3308 : to find the element that caused the break. */
3309 61566 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
3310 1409 : && LOOP_VINFO_EARLY_BRK_NEEDS_EPILOG (loop_vinfo))
3311 61566 : bound_epilog = vf;
3312 :
3313 61566 : bool epilog_peeling = maybe_ne (bound_epilog, 0U);
3314 61566 : poly_uint64 bound_scalar = bound_epilog;
3315 :
3316 61566 : if (!LOOP_VINFO_EARLY_BRK_NEEDS_EPILOG (loop_vinfo) && dump_enabled_p ())
3317 8132 : dump_printf_loc (MSG_NOTE, vect_location,
3318 : "early break does not require epilog.\n");
3319 :
3320 61566 : if (!prolog_peeling && !epilog_peeling)
3321 : return NULL;
3322 :
3323 : /* Before doing any peeling make sure to reset debug binds outside of
3324 : the loop referring to defs not in LC SSA. */
3325 32782 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3326 99616 : for (unsigned i = 0; i < loop->num_nodes; ++i)
3327 : {
3328 66834 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3329 66834 : imm_use_iterator ui;
3330 66834 : gimple *use_stmt;
3331 159420 : for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3332 92586 : gsi_next (&gsi))
3333 : {
3334 385596 : FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
3335 220244 : if (gimple_debug_bind_p (use_stmt)
3336 19820 : && loop != gimple_bb (use_stmt)->loop_father
3337 19836 : && !flow_loop_nested_p (loop,
3338 16 : gimple_bb (use_stmt)->loop_father))
3339 : {
3340 2 : gimple_debug_bind_reset_value (use_stmt);
3341 2 : update_stmt (use_stmt);
3342 92586 : }
3343 : }
3344 624675 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3345 491007 : gsi_next (&gsi))
3346 : {
3347 491007 : ssa_op_iter op_iter;
3348 491007 : def_operand_p def_p;
3349 795502 : FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
3350 1066326 : FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
3351 493601 : if (gimple_debug_bind_p (use_stmt)
3352 36265 : && loop != gimple_bb (use_stmt)->loop_father
3353 36292 : && !flow_loop_nested_p (loop,
3354 27 : gimple_bb (use_stmt)->loop_father))
3355 : {
3356 0 : gimple_debug_bind_reset_value (use_stmt);
3357 0 : update_stmt (use_stmt);
3358 304495 : }
3359 : }
3360 : }
3361 :
3362 32782 : prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
3363 32782 : estimated_vf = vect_vf_for_cost (loop_vinfo);
3364 32782 : if (estimated_vf == 2)
3365 6788 : estimated_vf = 3;
3366 32782 : prob_prolog = prob_epilog = profile_probability::guessed_always ()
3367 32782 : .apply_scale (estimated_vf - 1, estimated_vf);
3368 :
3369 32782 : class loop *prolog = NULL, *epilog = NULL;
3370 32782 : class loop *first_loop = loop;
3371 32782 : bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
3372 :
3373 : /* SSA form needs to be up-to-date since we are going to manually
3374 : update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
3375 : update SSA state after that, so we have to make sure to not lose any
3376 : pending update needs. */
3377 32782 : gcc_assert (!need_ssa_update_p (cfun));
3378 :
3379 : /* If we're vectorizing an epilogue loop, we have ensured that the
3380 : virtual operand is in SSA form throughout the vectorized main loop.
3381 : Normally it is possible to trace the updated
3382 : vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
3383 : back to scalar-stmt vuses, meaning that the effect of the SSA update
3384 : remains local to the main loop. However, there are rare cases in
3385 : which the vectorized loop should have vdefs even when the original scalar
3386 : loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
3387 : introduces clobbers of the temporary vector array, which in turn
3388 : needs new vdefs. If the scalar loop doesn't write to memory, these
3389 : new vdefs will be the only ones in the vector loop.
3390 : We are currently defering updating virtual SSA form and creating
3391 : of a virtual PHI for this case so we do not have to make sure the
3392 : newly introduced virtual def is in LCSSA form. */
3393 :
3394 32782 : if (MAY_HAVE_DEBUG_BIND_STMTS)
3395 : {
3396 12311 : gcc_assert (!adjust_vec.exists ());
3397 12311 : adjust_vec.create (32);
3398 : }
3399 32782 : initialize_original_copy_tables ();
3400 :
3401 : /* Record the anchor bb at which the guard should be placed if the scalar
3402 : loop might be preferred. */
3403 32782 : basic_block anchor = loop_preheader_edge (loop)->src;
3404 :
3405 : /* Generate the number of iterations for the prolog loop. We do this here
3406 : so that we can also get the upper bound on the number of iterations. */
3407 32782 : tree niters_prolog;
3408 32782 : poly_int64 bound_prolog = 0;
3409 32782 : if (prolog_peeling)
3410 : {
3411 428 : niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
3412 : &bound_prolog);
3413 : /* If algonment peeling is known, we will always execute prolog. */
3414 428 : if (TREE_CODE (niters_prolog) == INTEGER_CST)
3415 226 : prob_prolog = profile_probability::always ();
3416 : }
3417 : else
3418 32354 : niters_prolog = build_int_cst (type, 0);
3419 :
3420 32782 : loop_vec_info epilogue_vinfo = loop_vinfo->epilogue_vinfo;
3421 32782 : tree niters_vector_mult_vf = NULL_TREE;
3422 : /* Saving NITERs before the loop, as this may be changed by prologue. */
3423 32782 : tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
3424 32782 : edge update_e = NULL, skip_e = NULL;
3425 32782 : unsigned int lowest_vf = constant_lower_bound (vf);
3426 : /* Prolog loop may be skipped. */
3427 32782 : bool skip_prolog = (prolog_peeling != 0);
3428 : /* Skip this loop to epilog when there are not enough iterations to enter this
3429 : vectorized loop. If true we should perform runtime checks on the NITERS
3430 : to check whether we should skip the current vectorized loop. If we know
3431 : the number of scalar iterations we may choose to add a runtime check if
3432 : this number "maybe" smaller than the number of iterations required
3433 : when we know the number of scalar iterations may potentially
3434 : be smaller than the number of iterations required to enter this loop, for
3435 : this we use the upper bounds on the prolog and epilog peeling. When we
3436 : don't know the number of iterations and don't require versioning it is
3437 : because we have asserted that there are enough scalar iterations to enter
3438 : the main loop, so this skip is not necessary. When we are versioning then
3439 : we only add such a skip if we have chosen to vectorize the epilogue. */
3440 32782 : bool skip_vector = false;
3441 32782 : if (!uncounted_p)
3442 7890 : skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3443 40629 : ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
3444 7890 : bound_prolog + bound_epilog)
3445 24874 : : (!LOOP_VINFO_USE_VERSIONING_WITHOUT_PEELING (loop_vinfo)
3446 40 : || vect_epilogues));
3447 :
3448 : /* Epilog loop must be executed if the number of iterations for epilog
3449 : loop is known at compile time, otherwise we need to add a check at
3450 : the end of vector loop and skip to the end of epilog loop. */
3451 32782 : bool skip_epilog = (prolog_peeling < 0
3452 32580 : || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3453 32782 : || !vf.is_constant ());
3454 : /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
3455 : loop must be executed. */
3456 32782 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3457 32410 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3458 1201 : skip_epilog = false;
3459 :
3460 32782 : class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3461 32782 : auto_vec<profile_count> original_counts;
3462 32782 : basic_block *original_bbs = NULL;
3463 :
3464 32782 : if (skip_vector)
3465 : {
3466 24834 : split_edge (loop_preheader_edge (loop));
3467 :
3468 24834 : if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3469 : {
3470 22614 : original_bbs = get_loop_body (loop);
3471 68255 : for (unsigned int i = 0; i < loop->num_nodes; i++)
3472 45641 : original_counts.safe_push(original_bbs[i]->count);
3473 : }
3474 :
3475 : /* Due to the order in which we peel prolog and epilog, we first
3476 : propagate probability to the whole loop. The purpose is to
3477 : avoid adjusting probabilities of both prolog and vector loops
3478 : separately. Note in this case, the probability of epilog loop
3479 : needs to be scaled back later. */
3480 24834 : basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3481 24834 : if (prob_vector.initialized_p ())
3482 : {
3483 24834 : scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3484 24834 : scale_loop_profile (loop, prob_vector, -1);
3485 : }
3486 : }
3487 :
3488 32782 : if (vect_epilogues)
3489 : {
3490 : /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3491 : use the original scalar loop as remaining epilogue if necessary. */
3492 6823 : LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3493 6823 : = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3494 6823 : LOOP_VINFO_SCALAR_MAIN_EXIT (epilogue_vinfo)
3495 6823 : = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
3496 : }
3497 :
3498 32782 : if (prolog_peeling)
3499 : {
3500 428 : e = loop_preheader_edge (loop);
3501 428 : edge exit_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
3502 428 : gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e)
3503 : && (!LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3504 : || uncounted_p));
3505 :
3506 : /* Peel prolog and put it on preheader edge of loop. */
3507 428 : edge scalar_e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
3508 428 : edge prolog_e = NULL;
3509 428 : bool early_break_peel_p = LOOP_VINFO_EARLY_BRK_NEEDS_EPILOG (loop_vinfo);
3510 428 : prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, exit_e,
3511 : scalar_loop, scalar_e,
3512 : e, &prolog_e, true, NULL,
3513 : uncounted_p, uncounted_p,
3514 : early_break_peel_p);
3515 :
3516 428 : gcc_assert (prolog);
3517 428 : prolog->force_vectorize = false;
3518 :
3519 : /* Assign hierarchical discriminators to distinguish prolog loop. */
3520 428 : gimple *prolog_last = last_nondebug_stmt (prolog->header);
3521 428 : location_t prolog_loc
3522 428 : = prolog_last ? gimple_location (prolog_last) : UNKNOWN_LOCATION;
3523 428 : if (prolog_loc != UNKNOWN_LOCATION)
3524 : {
3525 426 : unsigned int prolog_copyid = allocate_copyid_base (prolog_loc, 1);
3526 426 : assign_discriminators_to_loop (prolog, 0, prolog_copyid);
3527 : }
3528 428 : first_loop = prolog;
3529 428 : reset_original_copy_tables ();
3530 :
3531 : /* Update the number of iterations for prolog loop. */
3532 428 : tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3533 428 : vect_set_loop_condition (prolog, prolog_e, NULL, niters_prolog,
3534 : step_prolog, NULL_TREE, false);
3535 :
3536 : /* Skip the prolog loop. */
3537 428 : if (skip_prolog)
3538 : {
3539 428 : guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3540 : niters_prolog, build_int_cst (type, 0));
3541 428 : guard_bb = loop_preheader_edge (prolog)->src;
3542 428 : basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3543 428 : guard_to = split_edge (loop_preheader_edge (loop));
3544 428 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3545 : guard_to, guard_bb,
3546 : prob_prolog.invert (),
3547 : irred_flag);
3548 1958 : for (edge alt_e : get_loop_exit_edges (prolog))
3549 : {
3550 674 : if (alt_e == prolog_e)
3551 428 : continue;
3552 246 : basic_block old_dom
3553 246 : = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
3554 246 : if (flow_bb_inside_loop_p (prolog, old_dom))
3555 : {
3556 101 : auto_vec<basic_block, 8> queue;
3557 101 : for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
3558 325 : son; son = next_dom_son (CDI_DOMINATORS, son))
3559 224 : if (!flow_bb_inside_loop_p (prolog, son))
3560 123 : queue.safe_push (son);
3561 426 : for (auto son : queue)
3562 123 : set_immediate_dominator (CDI_DOMINATORS, son, guard_bb);
3563 101 : }
3564 428 : }
3565 :
3566 428 : e = EDGE_PRED (guard_to, 0);
3567 428 : e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3568 428 : slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
3569 :
3570 428 : scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3571 428 : scale_loop_profile (prolog, prob_prolog,
3572 428 : estimated_poly_value (bound_prolog) - 1);
3573 : }
3574 :
3575 : /* Update init address of DRs. */
3576 428 : vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
3577 428 : if (!uncounted_p)
3578 : {
3579 : /* Update niters for vector loop. */
3580 397 : LOOP_VINFO_NITERS (loop_vinfo)
3581 397 : = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3582 397 : LOOP_VINFO_NITERSM1 (loop_vinfo)
3583 397 : = fold_build2 (MINUS_EXPR, type,
3584 : LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3585 : }
3586 428 : bool new_var_p = false;
3587 428 : niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
3588 : /* It's guaranteed that vector loop bound before vectorization is at
3589 : least VF, so set range information for newly generated var. */
3590 428 : if (new_var_p)
3591 : {
3592 212 : int_range<1> vr (type,
3593 424 : wi::to_wide (build_int_cst (type, lowest_vf)),
3594 424 : wi::to_wide (TYPE_MAX_VALUE (type)));
3595 212 : set_range_info (niters, vr);
3596 212 : }
3597 :
3598 : /* Prolog iterates at most bound_prolog times, latch iterates at
3599 : most bound_prolog - 1 times. */
3600 428 : if (bound_prolog.is_constant ())
3601 428 : record_niter_bound (prolog, bound_prolog.to_constant () - 1, false,
3602 : true);
3603 428 : delete_update_ssa ();
3604 428 : adjust_vec_debug_stmts ();
3605 428 : scev_reset ();
3606 : }
3607 32782 : basic_block bb_before_epilog = NULL;
3608 :
3609 32782 : if (epilog_peeling)
3610 : {
3611 32747 : e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
3612 32747 : gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e));
3613 :
3614 : /* Peel epilog and put it on exit edge of loop. If we are vectorizing
3615 : said epilog then we should use a copy of the main loop as a starting
3616 : point. This loop may have already had some preliminary transformations
3617 : to allow for more optimal vectorization, for example if-conversion.
3618 : If we are not vectorizing the epilog then we should use the scalar loop
3619 : as the transformations mentioned above make less or no sense when not
3620 : vectorizing. */
3621 32747 : edge scalar_e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
3622 32747 : epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3623 6823 : edge epilog_e = vect_epilogues ? e : scalar_e;
3624 32747 : edge new_epilog_e = NULL;
3625 32747 : auto_vec<basic_block> doms;
3626 32747 : bool early_break_peel_p = LOOP_VINFO_EARLY_BRK_NEEDS_EPILOG (loop_vinfo);
3627 32747 : epilog
3628 32747 : = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e, epilog, epilog_e, e,
3629 : &new_epilog_e, true, &doms,
3630 : uncounted_p, false,
3631 : early_break_peel_p);
3632 :
3633 32747 : LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo) = new_epilog_e;
3634 32747 : gcc_assert (epilog);
3635 32747 : gcc_assert (new_epilog_e);
3636 32747 : epilog->force_vectorize = false;
3637 32747 : bb_before_epilog = loop_preheader_edge (epilog)->src;
3638 :
3639 : /* Assign hierarchical discriminators to distinguish epilog loop.
3640 : Only assign if it's a scalar epilog. If it will be vectorized
3641 : (vect_epilogues), discriminators will be assigned.
3642 : Use dynamic copy_id allocation instead of hardcoded constants. */
3643 32747 : if (!vect_epilogues)
3644 : {
3645 25924 : gimple *epilog_last = last_nondebug_stmt (epilog->header);
3646 25924 : location_t epilog_loc
3647 25924 : = epilog_last ? gimple_location (epilog_last) : UNKNOWN_LOCATION;
3648 25895 : if (epilog_loc != UNKNOWN_LOCATION)
3649 : {
3650 21092 : unsigned int epilog_copyid = allocate_copyid_base (epilog_loc, 1);
3651 21092 : assign_discriminators_to_loop (epilog, 0, epilog_copyid);
3652 : }
3653 : }
3654 :
3655 : /* Scalar version loop may be preferred. In this case, add guard
3656 : and skip to epilog. Note this only happens when the number of
3657 : iterations of loop is unknown at compile time, otherwise this
3658 : won't be vectorized. */
3659 32747 : if (skip_vector)
3660 : {
3661 : /* Additional epilogue iteration is peeled if gap exists. */
3662 24834 : tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3663 24834 : bound_prolog, bound_epilog,
3664 : th, &bound_scalar,
3665 : check_profitability);
3666 : /* Build guard against NITERSM1 since NITERS may overflow. */
3667 24834 : guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3668 24834 : guard_bb = anchor;
3669 24834 : guard_to = split_edge (loop_preheader_edge (epilog));
3670 24834 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3671 : guard_to, guard_bb,
3672 : prob_vector.invert (),
3673 : irred_flag);
3674 24834 : skip_e = guard_e;
3675 24834 : e = EDGE_PRED (guard_to, 0);
3676 24834 : e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3677 :
3678 : /* Handle any remaining dominator updates needed after
3679 : inserting the loop skip edge above. */
3680 24834 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
3681 276 : && LOOP_VINFO_EARLY_BRK_NEEDS_EPILOG (loop_vinfo)
3682 138 : && prolog_peeling)
3683 : {
3684 : /* Adding a skip edge to skip a loop with multiple exits
3685 : means the dominator of the join blocks for all exits shifts
3686 : from the prolog skip guard to the loop skip guard. */
3687 27 : auto prolog_skip_bb
3688 27 : = single_pred (loop_preheader_edge (prolog)->src);
3689 27 : auto needs_update
3690 27 : = get_dominated_by (CDI_DOMINATORS, prolog_skip_bb);
3691 :
3692 : /* Update everything except for the immediate children of
3693 : the prolog skip block (the prolog and vector preheaders).
3694 : Those should remain dominated by the prolog skip block itself,
3695 : since the loop guard edge goes to the epilogue. */
3696 170 : for (auto bb : needs_update)
3697 89 : if (bb != EDGE_SUCC (prolog_skip_bb, 0)->dest
3698 89 : && bb != EDGE_SUCC (prolog_skip_bb, 1)->dest)
3699 35 : set_immediate_dominator (CDI_DOMINATORS, bb, guard_bb);
3700 27 : }
3701 :
3702 24834 : slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3703 :
3704 : /* Simply propagate profile info from guard_bb to guard_to which is
3705 : a merge point of control flow. */
3706 24834 : profile_count old_count = guard_to->count;
3707 24834 : guard_to->count = guard_bb->count;
3708 :
3709 : /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3710 24834 : if (vect_epilogues || scalar_loop == NULL)
3711 : {
3712 22614 : gcc_assert(epilog->num_nodes == loop->num_nodes);
3713 22614 : basic_block *bbs = get_loop_body (epilog);
3714 68255 : for (unsigned int i = 0; i < epilog->num_nodes; i++)
3715 : {
3716 45641 : gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
3717 45641 : bbs[i]->count = original_counts[i];
3718 : }
3719 22614 : free (bbs);
3720 22614 : free (original_bbs);
3721 : }
3722 2220 : else if (old_count.nonzero_p ())
3723 2220 : scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
3724 :
3725 : /* Only need to handle basic block before epilog loop if it's not
3726 : the guard_bb, which is the case when skip_vector is true. */
3727 24834 : if (guard_bb != bb_before_epilog && single_pred_p (bb_before_epilog))
3728 24696 : bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
3729 24834 : bb_before_epilog = loop_preheader_edge (epilog)->src;
3730 : }
3731 :
3732 32747 : if (!uncounted_p)
3733 : {
3734 : /* If loop is peeled for non-zero constant times, now niters refers to
3735 : orig_niters - prolog_peeling, it won't overflow even the
3736 : orig_niters overflows. */
3737 32704 : niters_no_overflow |= (prolog_peeling > 0);
3738 32704 : vect_gen_vector_loop_niters (loop_vinfo, niters,
3739 : niters_vector, step_vector,
3740 : niters_no_overflow);
3741 32704 : if (!integer_onep (*step_vector))
3742 : {
3743 : /* On exit from the loop we will have an easy way of calcalating
3744 : NITERS_VECTOR / STEP * STEP. Install a dummy definition
3745 : until then. */
3746 0 : niters_vector_mult_vf
3747 0 : = make_ssa_name (TREE_TYPE (*niters_vector));
3748 0 : SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3749 0 : *niters_vector_mult_vf_var = niters_vector_mult_vf;
3750 : }
3751 : else
3752 32704 : vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3753 : &niters_vector_mult_vf);
3754 : /* Update IVs of original loop as if they were advanced by
3755 : niters_vector_mult_vf steps. */
3756 32704 : gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3757 32704 : update_e = skip_vector ? e : loop_preheader_edge (epilog);
3758 : }
3759 32747 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3760 825 : update_e = single_succ_edge (LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest);
3761 :
3762 : /* If we have a peeled vector iteration we will never skip the epilog loop
3763 : and we can simplify the cfg a lot by not doing the edge split. */
3764 32747 : if (skip_epilog
3765 32747 : || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
3766 825 : && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)))
3767 : {
3768 25154 : guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3769 : niters, niters_vector_mult_vf);
3770 :
3771 25154 : guard_bb = LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest;
3772 25154 : edge epilog_e = LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo);
3773 25154 : guard_to = epilog_e->dest;
3774 25654 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3775 : skip_vector ? anchor : guard_bb,
3776 : prob_epilog.invert (),
3777 : irred_flag);
3778 :
3779 25154 : doms.safe_push (guard_to);
3780 25154 : if (vect_epilogues)
3781 5473 : epilogue_vinfo->skip_this_loop_edge = guard_e;
3782 25154 : edge main_iv = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
3783 25154 : gphi_iterator gsi2 = gsi_start_phis (main_iv->dest);
3784 25154 : for (gphi_iterator gsi = gsi_start_phis (guard_to);
3785 61489 : !gsi_end_p (gsi); gsi_next (&gsi))
3786 : {
3787 : /* We are expecting all of the PHIs we have on epilog_e
3788 : to be also on the main loop exit. But sometimes
3789 : a stray virtual definition can appear at epilog_e
3790 : which we can then take as the same on all exits,
3791 : we've removed the LC SSA PHI on the main exit before
3792 : so we wouldn't need to create a loop PHI for it. */
3793 36335 : if (virtual_operand_p (gimple_phi_result (*gsi))
3794 36335 : && (gsi_end_p (gsi2)
3795 36576 : || !virtual_operand_p (gimple_phi_result (*gsi2))))
3796 172 : add_phi_arg (*gsi,
3797 86 : gimple_phi_arg_def_from_edge (*gsi, epilog_e),
3798 : guard_e, UNKNOWN_LOCATION);
3799 : else
3800 : {
3801 36249 : add_phi_arg (*gsi, gimple_phi_result (*gsi2), guard_e,
3802 : UNKNOWN_LOCATION);
3803 36249 : gsi_next (&gsi2);
3804 : }
3805 : }
3806 :
3807 : /* Only need to handle basic block before epilog loop if it's not
3808 : the guard_bb, which is the case when skip_vector is true. */
3809 25154 : if (guard_bb != bb_before_epilog)
3810 : {
3811 25108 : prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3812 :
3813 25108 : scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3814 : }
3815 25154 : scale_loop_profile (epilog, prob_epilog, -1);
3816 : }
3817 :
3818 : /* If we have a peeled vector iteration, all exits are the same, leave it
3819 : and so the main exit needs to be treated the same as the alternative
3820 : exits in that we leave their updates to vectorizable_live_operations.
3821 : */
3822 32747 : tree vector_iters_vf = niters_vector_mult_vf;
3823 32747 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3824 : {
3825 825 : tree tmp_niters_vf
3826 825 : = make_ssa_name (LOOP_VINFO_EARLY_BRK_IV_TYPE (loop_vinfo));
3827 :
3828 43 : if (!(LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)
3829 911 : && get_loop_exit_edges (loop).length () == 1)
3830 839 : && LOOP_VINFO_EARLY_BRK_NEEDS_EPILOG (loop_vinfo))
3831 : {
3832 615 : basic_block exit_bb = NULL;
3833 615 : edge update_e = NULL;
3834 :
3835 : /* Identify the early exit merge block. I wish we had stored
3836 : this. */
3837 1845 : for (auto e : get_loop_exit_edges (loop))
3838 615 : if (e != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
3839 : {
3840 615 : exit_bb = e->dest;
3841 615 : update_e = single_succ_edge (exit_bb);
3842 615 : break;
3843 615 : }
3844 615 : vect_update_ivs_after_vectorizer (loop_vinfo, tmp_niters_vf,
3845 : update_e, true);
3846 : }
3847 825 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3848 : vector_iters_vf = tmp_niters_vf;
3849 :
3850 825 : LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo) = tmp_niters_vf;
3851 : }
3852 :
3853 32747 : bool recalculate_peel_niters_init
3854 32747 : = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
3855 32747 : vect_update_ivs_after_vectorizer (loop_vinfo, vector_iters_vf,
3856 : update_e,
3857 : recalculate_peel_niters_init);
3858 :
3859 : /* Recalculate the dominators after adding the guard edge. */
3860 32747 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3861 825 : iterate_fix_dominators (CDI_DOMINATORS, doms, false);
3862 :
3863 : /* When we do not have a loop-around edge to the epilog we know
3864 : the vector loop covered at least VF scalar iterations unless
3865 : we have early breaks.
3866 : Update any known upper bound with this knowledge. */
3867 32747 : if (! skip_vector
3868 7913 : && ! LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3869 : {
3870 7364 : if (epilog->any_upper_bound)
3871 7364 : epilog->nb_iterations_upper_bound -= lowest_vf;
3872 7364 : if (epilog->any_likely_upper_bound)
3873 7364 : epilog->nb_iterations_likely_upper_bound -= lowest_vf;
3874 7364 : if (epilog->any_estimate)
3875 7362 : epilog->nb_iterations_estimate -= lowest_vf;
3876 : }
3877 :
3878 32747 : unsigned HOST_WIDE_INT bound;
3879 32747 : if (bound_scalar.is_constant (&bound))
3880 : {
3881 32747 : gcc_assert (bound != 0);
3882 : /* Adjust the upper bound by the extra peeled vector iteration if we
3883 : are an epilogue of an peeled vect loop and not VLA. For VLA the
3884 : loop bounds are unknown. */
3885 65475 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3886 32747 : && vf.is_constant ())
3887 62 : bound += vf.to_constant ();
3888 : /* -1 to convert loop iterations to latch iterations. */
3889 32747 : record_niter_bound (epilog, bound - 1, false, true);
3890 32747 : scale_loop_profile (epilog, profile_probability::always (),
3891 : bound - 1);
3892 : }
3893 :
3894 32747 : delete_update_ssa ();
3895 32747 : adjust_vec_debug_stmts ();
3896 32747 : scev_reset ();
3897 32747 : }
3898 :
3899 32782 : if (vect_epilogues)
3900 : {
3901 6823 : epilog->aux = epilogue_vinfo;
3902 6823 : LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3903 6823 : LOOP_VINFO_MAIN_EXIT (epilogue_vinfo)
3904 6823 : = LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo);
3905 :
3906 6823 : loop_constraint_clear (epilog, LOOP_C_INFINITE);
3907 :
3908 : /* We now must calculate the number of NITERS performed by the previous
3909 : loop and EPILOGUE_NITERS to be performed by the epilogue. */
3910 6823 : tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3911 : niters_prolog, niters_vector_mult_vf);
3912 :
3913 : /* If skip_vector we may skip the previous loop, we insert a phi-node to
3914 : determine whether we are coming from the previous vectorized loop
3915 : using the update_e edge or the skip_vector basic block using the
3916 : skip_e edge. */
3917 6823 : if (skip_vector)
3918 : {
3919 5543 : gcc_assert (update_e != NULL && skip_e != NULL);
3920 5543 : gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3921 : update_e->dest);
3922 5543 : tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3923 5543 : gimple *stmt = gimple_build_assign (new_ssa, niters);
3924 5543 : gimple_stmt_iterator gsi;
3925 5543 : if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3926 5543 : && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3927 : {
3928 5543 : gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3929 5543 : gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3930 : }
3931 : else
3932 : {
3933 0 : gsi = gsi_last_bb (update_e->src);
3934 0 : gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3935 : }
3936 :
3937 5543 : niters = new_ssa;
3938 5543 : add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3939 5543 : add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3940 : UNKNOWN_LOCATION);
3941 5543 : niters = PHI_RESULT (new_phi);
3942 5543 : epilogue_vinfo->main_loop_edge = update_e;
3943 5543 : epilogue_vinfo->skip_main_loop_edge = skip_e;
3944 : }
3945 :
3946 : /* Set ADVANCE to the number of iterations performed by the previous
3947 : loop and its prologue. */
3948 6823 : *advance = niters;
3949 :
3950 : /* Subtract the number of iterations performed by the vectorized loop
3951 : from the number of total iterations. */
3952 6823 : tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3953 : before_loop_niters,
3954 : niters);
3955 :
3956 6823 : LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3957 6823 : LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3958 6823 : = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3959 : epilogue_niters,
3960 : build_one_cst (TREE_TYPE (epilogue_niters)));
3961 : }
3962 :
3963 32782 : adjust_vec.release ();
3964 32782 : free_original_copy_tables ();
3965 :
3966 32782 : return vect_epilogues ? epilog : NULL;
3967 32782 : }
3968 :
3969 : /* Function vect_create_cond_for_niters_checks.
3970 :
3971 : Create a conditional expression that represents the run-time checks for
3972 : loop's niter. The loop is guaranteed to terminate if the run-time
3973 : checks hold.
3974 :
3975 : Input:
3976 : COND_EXPR - input conditional expression. New conditions will be chained
3977 : with logical AND operation. If it is NULL, then the function
3978 : is used to return the number of alias checks.
3979 : LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3980 : to be checked.
3981 :
3982 : Output:
3983 : COND_EXPR - conditional expression.
3984 :
3985 : The returned COND_EXPR is the conditional expression to be used in the
3986 : if statement that controls which version of the loop gets executed at
3987 : runtime. */
3988 :
3989 : static void
3990 376 : vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3991 : {
3992 376 : tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3993 :
3994 376 : if (*cond_expr)
3995 376 : *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3996 : *cond_expr, part_cond_expr);
3997 : else
3998 0 : *cond_expr = part_cond_expr;
3999 376 : }
4000 :
4001 : /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
4002 : and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
4003 :
4004 : static void
4005 307 : chain_cond_expr (tree *cond_expr, tree part_cond_expr)
4006 : {
4007 307 : if (*cond_expr)
4008 303 : *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
4009 : *cond_expr, part_cond_expr);
4010 : else
4011 4 : *cond_expr = part_cond_expr;
4012 307 : }
4013 :
4014 : /* Function vect_create_cond_for_align_checks.
4015 :
4016 : Create a conditional expression that represents the alignment checks for
4017 : all of data references (array element references) whose alignment must be
4018 : checked at runtime.
4019 :
4020 : Input:
4021 : COND_EXPR - input conditional expression. New conditions will be chained
4022 : with logical AND operation.
4023 : LOOP_VINFO - three fields of the loop information are used.
4024 : LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
4025 : LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
4026 : LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT indicates which check applies.
4027 :
4028 : Output:
4029 : COND_EXPR_STMT_LIST - statements needed to construct the conditional
4030 : expression.
4031 : The returned value is the conditional expression to be used in the if
4032 : statement that controls which version of the loop gets executed at runtime.
4033 :
4034 : Based on the boolean value of LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT, we decide
4035 : which type of check should be applied and create two different expressions
4036 : accordingly.
4037 : 1) When LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is false, we see if all data refs
4038 : to be checked are already aligned to an alignment boundary. We create
4039 : an expression of "(a_1 | a_2 | a_3 | ... | a_n) & mask", where "a_i" is
4040 : the address of i'th data reference.
4041 : 2) When LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is true, we see if all data refs
4042 : can be aligned to a boundary after a certain amount of peeling, in other
4043 : words, their addresses have the same bottom bits according to the mask.
4044 : We create "((a_1 ^ a_2) | (a_2 ^ a_3) | ... | (a_n-1 ^ a_n)) & mask",
4045 : where "a_i" is the address of i'th data reference.
4046 :
4047 : Both algorithms make two assumptions:
4048 : 1) The number of bytes "n" in a vector is a power of 2.
4049 : 2) An address "a" is aligned if a%n is zero and that this
4050 : test can be done as a&(n-1) == 0. For example, for 16
4051 : byte vectors the test is a&0xf == 0. */
4052 :
4053 : static void
4054 45 : vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
4055 : tree *cond_expr,
4056 : gimple_seq *cond_expr_stmt_list)
4057 : {
4058 45 : const vec<stmt_vec_info> &may_misalign_stmts
4059 : = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
4060 45 : stmt_vec_info stmt_info;
4061 45 : poly_uint64 mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
4062 45 : tree mask_cst;
4063 45 : unsigned int i;
4064 45 : tree int_ptrsize_type;
4065 45 : char tmp_name[30];
4066 45 : tree or_tmp_name = NULL_TREE;
4067 45 : tree prev_addr_tmp_name = NULL_TREE;
4068 45 : tree and_tmp_name;
4069 45 : gimple *and_stmt;
4070 45 : tree ptrsize_zero;
4071 45 : tree part_cond_expr;
4072 :
4073 45 : gcc_assert (known_ne (mask, 0U));
4074 :
4075 45 : int_ptrsize_type = signed_type_for (ptr_type_node);
4076 :
4077 : /* If LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is true, we should have at least two
4078 : datarefs to check the mutual alignment. */
4079 45 : gcc_assert (may_misalign_stmts.length () > 1
4080 : || !LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo));
4081 :
4082 109 : FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
4083 : {
4084 64 : gimple_seq new_stmt_list = NULL;
4085 64 : tree addr_base;
4086 64 : tree addr_tmp_name;
4087 64 : tree xor_tmp_name;
4088 64 : tree new_or_tmp_name;
4089 64 : gimple *addr_stmt, *or_stmt, *xor_stmt;
4090 64 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4091 64 : bool negative = tree_int_cst_compare
4092 64 : (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
4093 64 : tree offset = negative
4094 64 : ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
4095 : * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
4096 64 : : size_zero_node;
4097 :
4098 : /* create: addr_tmp = (int)(address_of_first_vector) */
4099 64 : addr_base =
4100 64 : vect_create_addr_base_for_vector_ref (loop_vinfo,
4101 : stmt_info, &new_stmt_list,
4102 : offset);
4103 64 : if (new_stmt_list != NULL)
4104 14 : gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
4105 :
4106 64 : sprintf (tmp_name, "addr2int%d", i);
4107 64 : addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
4108 64 : addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
4109 64 : gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
4110 :
4111 64 : if (LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo))
4112 : {
4113 : /* Create "((a_1 ^ a_2) | (a_2 ^ a_3) | ... | (a_n-1 ^ a_n)) & mask"
4114 : to check mutual alignment. */
4115 36 : if (prev_addr_tmp_name != NULL_TREE)
4116 : {
4117 18 : sprintf (tmp_name, "xorptrs%d_%d", i - 1, i);
4118 18 : xor_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
4119 : tmp_name);
4120 18 : xor_stmt = gimple_build_assign (xor_tmp_name, BIT_XOR_EXPR,
4121 : prev_addr_tmp_name,
4122 : addr_tmp_name);
4123 18 : gimple_seq_add_stmt (cond_expr_stmt_list, xor_stmt);
4124 18 : if (or_tmp_name == NULL_TREE)
4125 : {
4126 : /* Create the 1st XOR when the 2nd data ref is seen. */
4127 : or_tmp_name = xor_tmp_name;
4128 : }
4129 : else
4130 : {
4131 : /* Create: or_tmp = or_tmp | new_xor_tmp. */
4132 0 : sprintf (tmp_name, "orxors%d", i - 1);
4133 0 : new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
4134 : tmp_name);
4135 0 : or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
4136 : or_tmp_name, xor_tmp_name);
4137 0 : gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
4138 0 : or_tmp_name = new_or_tmp_name;
4139 : }
4140 : }
4141 : prev_addr_tmp_name = addr_tmp_name;
4142 : }
4143 : else
4144 : {
4145 : /* Create: "(a_1 | a_2 | a_3 | ... | a_n) & mask" to check if all
4146 : addresses are already aligned. */
4147 28 : if (or_tmp_name != NULL_TREE)
4148 : {
4149 : /* Create: or_tmp = or_tmp | addr_tmp. */
4150 1 : sprintf (tmp_name, "orptrs%d", i);
4151 1 : new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
4152 : tmp_name);
4153 1 : or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
4154 : or_tmp_name, addr_tmp_name);
4155 1 : gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
4156 1 : or_tmp_name = new_or_tmp_name;
4157 : }
4158 : else
4159 : or_tmp_name = addr_tmp_name;
4160 : }
4161 :
4162 : } /* end for i */
4163 :
4164 45 : mask_cst = build_int_cst (int_ptrsize_type, mask);
4165 :
4166 : /* create: and_tmp = or_tmp & mask */
4167 45 : and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
4168 :
4169 45 : and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
4170 : or_tmp_name, mask_cst);
4171 45 : gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
4172 :
4173 : /* Make and_tmp the left operand of the conditional test against zero.
4174 : if and_tmp has a nonzero bit then some address is unaligned. */
4175 45 : ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
4176 45 : part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
4177 : and_tmp_name, ptrsize_zero);
4178 45 : chain_cond_expr (cond_expr, part_cond_expr);
4179 45 : }
4180 :
4181 : /* Function vect_create_cond_for_vla_spec_read.
4182 :
4183 : Create a conditional expression that represents the run-time checks with
4184 : max speculative read amount in VLA modes. We check two things:
4185 : 1) if the max speculative read amount exceeds the min page size
4186 : 2) if the VF is power-of-2 - done by checking the max read amount instead
4187 :
4188 : Input:
4189 : COND_EXPR - input conditional expression. New conditions will be chained
4190 : with logical AND operation.
4191 : LOOP_VINFO - field LOOP_VINFO_MAX_SPEC_READ_AMOUNT contains the max
4192 : possible speculative read amount in VLA modes.
4193 :
4194 : Output:
4195 : COND_EXPR - conditional expression.
4196 :
4197 : The returned COND_EXPR is the conditional expression to be used in the
4198 : if statement that controls which version of the loop gets executed at
4199 : runtime. */
4200 :
4201 : static void
4202 0 : vect_create_cond_for_vla_spec_read (loop_vec_info loop_vinfo, tree *cond_expr)
4203 : {
4204 0 : poly_uint64 read_amount_poly = LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo);
4205 0 : tree amount = build_int_cst (long_unsigned_type_node, read_amount_poly);
4206 :
4207 : /* Both the read amount and the VF must be variants, and the read amount must
4208 : be a constant power-of-2 multiple of the VF. */
4209 0 : unsigned HOST_WIDE_INT multiple;
4210 0 : gcc_assert (!read_amount_poly.is_constant ()
4211 : && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
4212 : && constant_multiple_p (read_amount_poly,
4213 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
4214 : &multiple)
4215 : && pow2p_hwi (multiple));
4216 :
4217 : tree cst_ul_zero = build_int_cstu (long_unsigned_type_node, 0U);
4218 : tree cst_ul_one = build_int_cstu (long_unsigned_type_node, 1U);
4219 : tree cst_ul_pagesize = build_int_cstu (long_unsigned_type_node,
4220 : (unsigned long) param_min_pagesize);
4221 :
4222 : /* Create an expression of "amount & (amount - 1) == 0". */
4223 : tree amount_m1 = fold_build2 (MINUS_EXPR, long_unsigned_type_node,
4224 : amount, cst_ul_one);
4225 : tree amount_and_expr = fold_build2 (BIT_AND_EXPR, long_unsigned_type_node,
4226 : amount, amount_m1);
4227 : tree powof2_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
4228 : amount_and_expr, cst_ul_zero);
4229 : chain_cond_expr (cond_expr, powof2_cond_expr);
4230 :
4231 : /* Create an expression of "amount <= cst_ul_pagesize". */
4232 : tree pagesize_cond_expr = fold_build2 (LE_EXPR, boolean_type_node,
4233 : amount, cst_ul_pagesize);
4234 : chain_cond_expr (cond_expr, pagesize_cond_expr);
4235 : }
4236 :
4237 : /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
4238 : create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
4239 : Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
4240 : and this new condition are true. Treat a null *COND_EXPR as "true". */
4241 :
4242 : static void
4243 3299 : vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
4244 : {
4245 3299 : const vec<vec_object_pair> &pairs
4246 : = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
4247 3299 : unsigned int i;
4248 3299 : vec_object_pair *pair;
4249 3311 : FOR_EACH_VEC_ELT (pairs, i, pair)
4250 : {
4251 12 : tree addr1 = build_fold_addr_expr (pair->first);
4252 12 : tree addr2 = build_fold_addr_expr (pair->second);
4253 12 : tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
4254 : addr1, addr2);
4255 12 : chain_cond_expr (cond_expr, part_cond_expr);
4256 : }
4257 3299 : }
4258 :
4259 : /* Create an expression that is true when all lower-bound conditions for
4260 : the vectorized loop are met. Chain this condition with *COND_EXPR. */
4261 :
4262 : static void
4263 3299 : vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
4264 : {
4265 3299 : const vec<vec_lower_bound> &lower_bounds
4266 : = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
4267 3549 : for (unsigned int i = 0; i < lower_bounds.length (); ++i)
4268 : {
4269 250 : tree expr = lower_bounds[i].expr;
4270 250 : tree type = unsigned_type_for (TREE_TYPE (expr));
4271 250 : expr = fold_convert (type, expr);
4272 250 : poly_uint64 bound = lower_bounds[i].min_value;
4273 250 : if (!lower_bounds[i].unsigned_p)
4274 : {
4275 91 : expr = fold_build2 (PLUS_EXPR, type, expr,
4276 : build_int_cstu (type, bound - 1));
4277 91 : bound += bound - 1;
4278 : }
4279 250 : tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
4280 : build_int_cstu (type, bound));
4281 250 : chain_cond_expr (cond_expr, part_cond_expr);
4282 : }
4283 3299 : }
4284 :
4285 : /* Function vect_create_cond_for_alias_checks.
4286 :
4287 : Create a conditional expression that represents the run-time checks for
4288 : overlapping of address ranges represented by a list of data references
4289 : relations passed as input.
4290 :
4291 : Input:
4292 : COND_EXPR - input conditional expression. New conditions will be chained
4293 : with logical AND operation. If it is NULL, then the function
4294 : is used to return the number of alias checks.
4295 : LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
4296 : to be checked.
4297 :
4298 : Output:
4299 : COND_EXPR - conditional expression.
4300 :
4301 : The returned COND_EXPR is the conditional expression to be used in the if
4302 : statement that controls which version of the loop gets executed at runtime.
4303 : */
4304 :
4305 : void
4306 3299 : vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
4307 : {
4308 3299 : const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
4309 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
4310 :
4311 3299 : if (comp_alias_ddrs.is_empty ())
4312 : return;
4313 :
4314 3170 : create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
4315 : &comp_alias_ddrs, cond_expr);
4316 3170 : if (dump_enabled_p ())
4317 1284 : dump_printf_loc (MSG_NOTE, vect_location,
4318 : "created %u versioning for alias checks.\n",
4319 : comp_alias_ddrs.length ());
4320 : }
4321 :
4322 :
4323 : /* Function vect_loop_versioning.
4324 :
4325 : If the loop has data references that may or may not be aligned or/and
4326 : has data reference relations whose independence was not proven then
4327 : two versions of the loop need to be generated, one which is vectorized
4328 : and one which isn't. A test is then generated to control which of the
4329 : loops is executed. The test checks for the alignment of all of the
4330 : data references that may or may not be aligned. An additional
4331 : sequence of runtime tests is generated for each pairs of DDRs whose
4332 : independence was not proven. The vectorized version of loop is
4333 : executed only if both alias and alignment tests are passed.
4334 :
4335 : The test generated to check which version of loop is executed
4336 : is modified to also check for profitability as indicated by the
4337 : cost model threshold TH.
4338 :
4339 : The versioning precondition(s) are placed in *COND_EXPR and
4340 : *COND_EXPR_STMT_LIST. */
4341 :
4342 : class loop *
4343 3757 : vect_loop_versioning (loop_vec_info loop_vinfo,
4344 : gimple *loop_vectorized_call)
4345 : {
4346 3757 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
4347 3757 : class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
4348 3757 : basic_block condition_bb;
4349 3757 : gphi_iterator gsi;
4350 3757 : gimple_stmt_iterator cond_exp_gsi;
4351 3757 : basic_block merge_bb;
4352 3757 : basic_block new_exit_bb;
4353 3757 : edge new_exit_e, e;
4354 3757 : gphi *orig_phi, *new_phi;
4355 3757 : tree cond_expr = NULL_TREE;
4356 3757 : gimple_seq cond_expr_stmt_list = NULL;
4357 3757 : tree arg;
4358 3757 : profile_probability prob = profile_probability::likely ();
4359 3757 : gimple_seq gimplify_stmt_list = NULL;
4360 3757 : tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
4361 3757 : bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
4362 3757 : bool version_spec_read = LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (loop_vinfo);
4363 3757 : bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
4364 3757 : bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
4365 3757 : poly_uint64 versioning_threshold
4366 : = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
4367 3757 : tree version_simd_if_cond
4368 : = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
4369 3757 : unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
4370 3757 : bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);
4371 :
4372 3757 : if (!uncounted_p && vect_apply_runtime_profitability_check_p (loop_vinfo)
4373 : && !ordered_p (th, versioning_threshold))
4374 : cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
4375 : build_int_cst (TREE_TYPE (scalar_loop_iters),
4376 : th - 1));
4377 3757 : if (!uncounted_p && maybe_ne (versioning_threshold, 0U))
4378 : {
4379 3744 : tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
4380 : build_int_cst (TREE_TYPE (scalar_loop_iters),
4381 : versioning_threshold - 1));
4382 3744 : if (cond_expr)
4383 0 : cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
4384 : expr, cond_expr);
4385 : else
4386 3744 : cond_expr = expr;
4387 : }
4388 :
4389 3757 : tree cost_name = NULL_TREE;
4390 3757 : profile_probability prob2 = profile_probability::always ();
4391 3757 : if (cond_expr
4392 3744 : && EXPR_P (cond_expr)
4393 2586 : && (version_niter
4394 2586 : || version_align
4395 : || version_alias
4396 2250 : || version_simd_if_cond))
4397 : {
4398 2586 : cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4399 : &cond_expr_stmt_list,
4400 : is_gimple_val, NULL_TREE);
4401 : /* Split prob () into two so that the overall probability of passing
4402 : both the cost-model and versioning checks is the orig prob. */
4403 2586 : prob2 = prob = prob.sqrt ();
4404 : }
4405 :
4406 3757 : if (version_niter)
4407 376 : vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
4408 :
4409 3757 : if (cond_expr)
4410 : {
4411 3744 : gimple_seq tem = NULL;
4412 3744 : cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4413 : &tem, is_gimple_condexpr_for_cond,
4414 : NULL_TREE);
4415 3744 : gimple_seq_add_seq (&cond_expr_stmt_list, tem);
4416 : }
4417 :
4418 3757 : if (version_align)
4419 45 : vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
4420 : &cond_expr_stmt_list);
4421 :
4422 3757 : if (version_spec_read)
4423 0 : vect_create_cond_for_vla_spec_read (loop_vinfo, &cond_expr);
4424 :
4425 3757 : if (version_alias)
4426 : {
4427 3299 : vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
4428 3299 : vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
4429 3299 : vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
4430 : }
4431 :
4432 3757 : if (version_simd_if_cond)
4433 : {
4434 58 : gcc_assert (dom_info_available_p (CDI_DOMINATORS));
4435 58 : if (flag_checking)
4436 58 : if (basic_block bb
4437 58 : = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
4438 58 : gcc_assert (bb != loop->header
4439 : && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
4440 : && (scalar_loop == NULL
4441 : || (bb != scalar_loop->header
4442 : && dominated_by_p (CDI_DOMINATORS,
4443 : scalar_loop->header, bb))));
4444 58 : tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
4445 58 : tree c = fold_build2 (NE_EXPR, boolean_type_node,
4446 : version_simd_if_cond, zero);
4447 58 : if (cond_expr)
4448 58 : cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
4449 : c, cond_expr);
4450 : else
4451 0 : cond_expr = c;
4452 58 : if (dump_enabled_p ())
4453 5 : dump_printf_loc (MSG_NOTE, vect_location,
4454 : "created versioning for simd if condition check.\n");
4455 : }
4456 :
4457 3757 : cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4458 : &gimplify_stmt_list,
4459 : is_gimple_condexpr_for_cond, NULL_TREE);
4460 3757 : gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
4461 :
4462 : /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
4463 : invariant in. */
4464 3757 : class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
4465 3757 : for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
4466 67985 : !gsi_end_p (gsi); gsi_next (&gsi))
4467 : {
4468 64228 : gimple *stmt = gsi_stmt (gsi);
4469 64228 : update_stmt (stmt);
4470 64228 : ssa_op_iter iter;
4471 64228 : use_operand_p use_p;
4472 64228 : basic_block def_bb;
4473 147679 : FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
4474 83451 : if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
4475 83451 : && flow_bb_inside_loop_p (outermost, def_bb))
4476 2515 : outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
4477 : }
4478 :
4479 : /* Search for the outermost loop we can version. Avoid versioning of
4480 : non-perfect nests but allow if-conversion versioned loops inside. */
4481 3757 : class loop *loop_to_version = loop;
4482 3757 : if (flow_loop_nested_p (outermost, loop))
4483 : {
4484 1454 : if (dump_enabled_p ())
4485 676 : dump_printf_loc (MSG_NOTE, vect_location,
4486 : "trying to apply versioning to outer loop %d\n",
4487 : outermost->num);
4488 1454 : if (outermost->num == 0)
4489 1362 : outermost = superloop_at_depth (loop, 1);
4490 : /* And avoid applying versioning on non-perfect nests. */
4491 : while (loop_to_version != outermost
4492 118 : && (e = single_exit (loop_outer (loop_to_version)))
4493 99 : && !(e->flags & EDGE_COMPLEX)
4494 98 : && (!loop_outer (loop_to_version)->inner->next
4495 53 : || vect_loop_vectorized_call (loop_to_version))
4496 1554 : && (!loop_outer (loop_to_version)->inner->next
4497 5 : || !loop_outer (loop_to_version)->inner->next->next))
4498 : loop_to_version = loop_outer (loop_to_version);
4499 : }
4500 :
4501 : /* Apply versioning. If there is already a scalar version created by
4502 : if-conversion re-use that. Note we cannot re-use the copy of
4503 : an if-converted outer-loop when vectorizing the inner loop only. */
4504 3757 : gcond *cond;
4505 3757 : if ((!loop_to_version->inner || loop == loop_to_version)
4506 3719 : && loop_vectorized_call)
4507 : {
4508 88 : gcc_assert (scalar_loop);
4509 88 : condition_bb = gimple_bb (loop_vectorized_call);
4510 176 : cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
4511 88 : gimple_cond_set_condition_from_tree (cond, cond_expr);
4512 88 : update_stmt (cond);
4513 :
4514 88 : if (cond_expr_stmt_list)
4515 : {
4516 88 : cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
4517 88 : gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4518 : GSI_SAME_STMT);
4519 : }
4520 :
4521 : /* if-conversion uses profile_probability::always () for both paths,
4522 : reset the paths probabilities appropriately. */
4523 88 : edge te, fe;
4524 88 : extract_true_false_edges_from_block (condition_bb, &te, &fe);
4525 88 : te->probability = prob;
4526 88 : fe->probability = prob.invert ();
4527 : /* We can scale loops counts immediately but have to postpone
4528 : scaling the scalar loop because we re-use it during peeling.
4529 :
4530 : Ifcvt duplicates loop preheader, loop body and produces an basic
4531 : block after loop exit. We need to scale all that. */
4532 88 : basic_block preheader = loop_preheader_edge (loop_to_version)->src;
4533 88 : preheader->count = preheader->count.apply_probability (prob * prob2);
4534 88 : scale_loop_frequencies (loop_to_version, prob * prob2);
4535 : /* When the loop has multiple exits then we can only version itself.
4536 : This is denoted by loop_to_version == loop. In this case we can
4537 : do the versioning by selecting the exit edge the vectorizer is
4538 : currently using. */
4539 88 : edge exit_edge;
4540 88 : if (loop_to_version == loop)
4541 88 : exit_edge = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
4542 : else
4543 0 : exit_edge = single_exit (loop_to_version);
4544 88 : exit_edge->dest->count = preheader->count;
4545 88 : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert ();
4546 :
4547 88 : nloop = scalar_loop;
4548 88 : if (dump_enabled_p ())
4549 90 : dump_printf_loc (MSG_NOTE, vect_location,
4550 : "reusing %sloop version created by if conversion\n",
4551 : loop_to_version != loop ? "outer " : "");
4552 88 : }
4553 : else
4554 : {
4555 3669 : if (loop_to_version != loop
4556 3669 : && dump_enabled_p ())
4557 14 : dump_printf_loc (MSG_NOTE, vect_location,
4558 : "applying loop versioning to outer loop %d\n",
4559 : loop_to_version->num);
4560 :
4561 3669 : unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
4562 :
4563 3669 : initialize_original_copy_tables ();
4564 7338 : nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
4565 3669 : prob * prob2, (prob * prob2).invert (),
4566 3669 : prob * prob2, (prob * prob2).invert (),
4567 : true);
4568 :
4569 : /* If the PHI nodes in the loop header were reallocated, we need to fix up
4570 : our internally stashed copies of those. */
4571 3669 : if (loop_to_version == loop)
4572 3631 : for (auto gsi = gsi_start_phis (loop->header);
4573 14486 : !gsi_end_p (gsi); gsi_next (&gsi))
4574 10855 : loop_vinfo->resync_stmt_addr (gsi.phi ());
4575 :
4576 : /* We will later insert second conditional so overall outcome of
4577 : both is prob * prob2. */
4578 3669 : edge true_e, false_e;
4579 3669 : extract_true_false_edges_from_block (condition_bb, &true_e, &false_e);
4580 3669 : true_e->probability = prob;
4581 3669 : false_e->probability = prob.invert ();
4582 3669 : gcc_assert (nloop);
4583 3669 : nloop = get_loop_copy (loop);
4584 :
4585 : /* Assign hierarchical discriminators to distinguish loop versions.
4586 : Only assign to the scalar version here; the vectorized version will
4587 : get discriminators later during transformation/peeling.
4588 : Use dynamic copy_id allocation instead of hardcoded constants. */
4589 3669 : gimple *nloop_last = last_nondebug_stmt (nloop->header);
4590 3669 : location_t nloop_loc
4591 3669 : = nloop_last ? gimple_location (nloop_last) : UNKNOWN_LOCATION;
4592 3669 : if (nloop_loc != UNKNOWN_LOCATION)
4593 : {
4594 3208 : unsigned int nloop_copyid = allocate_copyid_base (nloop_loc, 1);
4595 3208 : assign_discriminators_to_loop (nloop, 0, nloop_copyid);
4596 : }
4597 : /* For cycle vectorization with SLP we rely on the PHI arguments
4598 : appearing in the same order as the SLP node operands which for the
4599 : loop PHI nodes means the preheader edge dest index needs to remain
4600 : the same for the analyzed loop which also becomes the vectorized one.
4601 : Make it so in case the state after versioning differs by redirecting
4602 : the first edge into the header to the same destination which moves
4603 : it last. */
4604 3669 : if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
4605 : {
4606 292 : edge e = EDGE_PRED (loop->header, 0);
4607 292 : ssa_redirect_edge (e, e->dest);
4608 292 : flush_pending_stmts (e);
4609 : }
4610 3669 : gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
4611 :
4612 : /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
4613 : reap those otherwise; they also refer to the original
4614 : loops. */
4615 : class loop *l = loop;
4616 3674 : while (gimple *call = vect_loop_vectorized_call (l))
4617 : {
4618 5 : call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
4619 5 : fold_loop_internal_call (call, boolean_false_node);
4620 5 : l = loop_outer (l);
4621 5 : }
4622 3669 : free_original_copy_tables ();
4623 :
4624 3669 : if (cond_expr_stmt_list)
4625 : {
4626 3588 : cond_exp_gsi = gsi_last_bb (condition_bb);
4627 3588 : gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4628 : GSI_SAME_STMT);
4629 : }
4630 :
4631 : /* Loop versioning violates an assumption we try to maintain during
4632 : vectorization - that the loop exit block has a single predecessor.
4633 : After versioning, the exit block of both loop versions is the same
4634 : basic block (i.e. it has two predecessors). Just in order to simplify
4635 : following transformations in the vectorizer, we fix this situation
4636 : here by adding a new (empty) block on the exit-edge of the loop,
4637 : with the proper loop-exit phis to maintain loop-closed-form.
4638 : If loop versioning wasn't done from loop, but scalar_loop instead,
4639 : merge_bb will have already just a single successor. */
4640 :
4641 : /* When the loop has multiple exits then we can only version itself.
4642 : This is denoted by loop_to_version == loop. In this case we can
4643 : do the versioning by selecting the exit edge the vectorizer is
4644 : currently using. */
4645 3669 : edge exit_edge;
4646 3669 : if (loop_to_version == loop)
4647 3631 : exit_edge = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
4648 : else
4649 38 : exit_edge = single_exit (loop_to_version);
4650 :
4651 3669 : gcc_assert (exit_edge);
4652 3669 : merge_bb = exit_edge->dest;
4653 3669 : if (EDGE_COUNT (merge_bb->preds) >= 2)
4654 : {
4655 3669 : gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
4656 3669 : new_exit_bb = split_edge (exit_edge);
4657 3669 : new_exit_e = exit_edge;
4658 3669 : e = EDGE_SUCC (new_exit_bb, 0);
4659 :
4660 7585 : for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
4661 3916 : gsi_next (&gsi))
4662 : {
4663 3916 : tree new_res;
4664 3916 : orig_phi = gsi.phi ();
4665 3916 : new_res = copy_ssa_name (PHI_RESULT (orig_phi));
4666 3916 : new_phi = create_phi_node (new_res, new_exit_bb);
4667 3916 : arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4668 3916 : add_phi_arg (new_phi, arg, new_exit_e,
4669 : gimple_phi_arg_location_from_edge (orig_phi, e));
4670 3916 : adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
4671 : }
4672 : }
4673 :
4674 3669 : update_ssa (TODO_update_ssa_no_phi);
4675 : }
4676 :
4677 : /* Split the cost model check off to a separate BB. Costing assumes
4678 : this is the only thing we perform when we enter the scalar loop
4679 : from a failed cost decision. */
4680 3757 : if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
4681 : {
4682 2586 : gimple *def = SSA_NAME_DEF_STMT (cost_name);
4683 2586 : gcc_assert (gimple_bb (def) == condition_bb);
4684 : /* All uses of the cost check are 'true' after the check we
4685 : are going to insert. */
4686 2586 : replace_uses_by (cost_name, boolean_true_node);
4687 : /* And we're going to build the new single use of it. */
4688 2586 : gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
4689 : NULL_TREE, NULL_TREE);
4690 2586 : edge e = split_block (gimple_bb (def), def);
4691 2586 : gimple_stmt_iterator gsi = gsi_for_stmt (def);
4692 2586 : gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
4693 2586 : edge true_e, false_e;
4694 2586 : extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
4695 2586 : e->flags &= ~EDGE_FALLTHRU;
4696 2586 : e->flags |= EDGE_TRUE_VALUE;
4697 2586 : edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
4698 2586 : e->probability = prob2;
4699 2586 : e2->probability = prob2.invert ();
4700 2586 : e->dest->count = e->count ();
4701 2586 : set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
4702 2586 : auto_vec<basic_block, 3> adj;
4703 2586 : for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
4704 7662 : son;
4705 5076 : son = next_dom_son (CDI_DOMINATORS, son))
4706 7566 : if (EDGE_COUNT (son->preds) > 1)
4707 2490 : adj.safe_push (son);
4708 10248 : for (auto son : adj)
4709 2490 : set_immediate_dominator (CDI_DOMINATORS, son, e->src);
4710 : //debug_bb (condition_bb);
4711 : //debug_bb (e->src);
4712 2586 : }
4713 :
4714 3757 : if (version_niter)
4715 : {
4716 : /* The versioned loop could be infinite, we need to clear existing
4717 : niter information which is copied from the original loop. */
4718 376 : gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
4719 376 : vect_free_loop_info_assumptions (nloop);
4720 : }
4721 :
4722 3757 : if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
4723 3757 : && dump_enabled_p ())
4724 : {
4725 818 : if (version_alias)
4726 744 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4727 : vect_location,
4728 : "loop versioned for vectorization because of "
4729 : "possible aliasing\n");
4730 818 : if (version_align)
4731 30 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4732 : vect_location,
4733 : "loop versioned for vectorization to enhance "
4734 : "alignment\n");
4735 :
4736 : }
4737 :
4738 3757 : return nloop;
4739 : }
|