Line data Source code
1 : /* Vectorizer Specific Loop Manipulations
2 : Copyright (C) 2003-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : and Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #include "config.h"
23 : #include "system.h"
24 : #include "coretypes.h"
25 : #include "backend.h"
26 : #include "tree.h"
27 : #include "gimple.h"
28 : #include "cfghooks.h"
29 : #include "tree-pass.h"
30 : #include "ssa.h"
31 : #include "fold-const.h"
32 : #include "cfganal.h"
33 : #include "gimplify.h"
34 : #include "gimple-iterator.h"
35 : #include "gimplify-me.h"
36 : #include "tree-cfg.h"
37 : #include "tree-ssa-loop-manip.h"
38 : #include "tree-into-ssa.h"
39 : #include "tree-ssa.h"
40 : #include "cfgloop.h"
41 : #include "tree-scalar-evolution.h"
42 : #include "tree-vectorizer.h"
43 : #include "tree-ssa-loop-ivopts.h"
44 : #include "gimple-fold.h"
45 : #include "tree-ssa-loop-niter.h"
46 : #include "internal-fn.h"
47 : #include "stor-layout.h"
48 : #include "optabs-query.h"
49 : #include "vec-perm-indices.h"
50 : #include "insn-config.h"
51 : #include "rtl.h"
52 : #include "recog.h"
53 : #include "langhooks.h"
54 : #include "tree-vector-builder.h"
55 : #include "optabs-tree.h"
56 : #include "hierarchical_discriminator.h"
57 :
58 :
59 : /*************************************************************************
60 : Simple Loop Peeling Utilities
61 :
62 : Utilities to support loop peeling for vectorization purposes.
63 : *************************************************************************/
64 :
65 :
66 : /* Renames the use *OP_P. */
67 :
68 : static void
69 916713 : rename_use_op (use_operand_p op_p)
70 : {
71 916713 : tree new_name;
72 :
73 916713 : if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
74 : return;
75 :
76 913196 : new_name = get_current_def (USE_FROM_PTR (op_p));
77 :
78 : /* Something defined outside of the loop. */
79 913196 : if (!new_name)
80 : return;
81 :
82 : /* An ordinary ssa name defined in the loop. */
83 :
84 787216 : SET_USE (op_p, new_name);
85 : }
86 :
/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
   on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
   true.  */

static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  class loop *loop = bb->loop_father;
  class loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  /* Rename all SSA uses in the statements of BB.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
	rename_use_op (use_p);
    }

  /* Rename the PHI arguments of BB for each incoming edge, skipping
     edges from outside the loop unless RENAME_FROM_OUTER_LOOP allows
     them.  */
  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
	{
	  if (!rename_from_outer_loop)
	    continue;
	  if (e->src != outer_loop->header)
	    {
	      if (outer_loop->inner->next)
		{
		  /* If outer_loop has 2 inner loops, allow there to
		     be an extra basic block which decides which of the
		     two loops to use using LOOP_VECTORIZED.  */
		  if (!single_pred_p (e->src)
		      || single_pred (e->src) != outer_loop->header)
		    continue;
		}
	    }
	}
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}
141 :
142 :
/* A single postponed debug-stmt adjustment: references to FROM that are
   dominated by BB (but not by FROM's definition) should be replaced by
   TO.  */

struct adjust_info
{
  tree from, to;
  basic_block bb;
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;
156 :
/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  /* Dominance info is required for the tests below.  */
  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
	continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      /* The use must be dominated by AI->bb but not by the original
	 definition, otherwise the original value is still correct.  */
      if ((bbuse == bbphi
	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
	  && !(bbuse == bbdef
	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
	{
	  if (new_def)
	    /* Rewrite every use in this debug bind to the new value.  */
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_def);
	  else
	    {
	      /* No replacement value: drop the bound value entirely.  */
	      gimple_debug_bind_reset_value (stmt);
	      update_stmt (stmt);
	    }
	}
    }
}
203 :
204 : /* Adjust debug stmts as scheduled before. */
205 :
206 : static void
207 33780 : adjust_vec_debug_stmts (void)
208 : {
209 33780 : if (!MAY_HAVE_DEBUG_BIND_STMTS)
210 : return;
211 :
212 12468 : gcc_assert (adjust_vec.exists ());
213 :
214 91607 : while (!adjust_vec.is_empty ())
215 : {
216 79139 : adjust_debug_stmts_now (&adjust_vec.last ());
217 79139 : adjust_vec.pop ();
218 : }
219 : }
220 :
221 : /* Adjust any debug stmts that referenced FROM values to use the
222 : loop-closed TO, if the references are dominated by BB and not by
223 : the definition of FROM. If adjust_vec is non-NULL, adjustments
224 : will be postponed until adjust_vec_debug_stmts is called. */
225 :
226 : static void
227 107246 : adjust_debug_stmts (tree from, tree to, basic_block bb)
228 : {
229 107246 : adjust_info ai;
230 :
231 107246 : if (MAY_HAVE_DEBUG_BIND_STMTS
232 107246 : && TREE_CODE (from) == SSA_NAME
233 97846 : && ! SSA_NAME_IS_DEFAULT_DEF (from)
234 203748 : && ! virtual_operand_p (from))
235 : {
236 79252 : ai.from = from;
237 79252 : ai.to = to;
238 79252 : ai.bb = bb;
239 :
240 79252 : if (adjust_vec.exists ())
241 79139 : adjust_vec.safe_push (ai);
242 : else
243 113 : adjust_debug_stmts_now (&ai);
244 : }
245 107246 : }
246 :
247 : /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
248 : to adjust any debug stmts that referenced the old phi arg,
249 : presumably non-loop-closed references left over from other
250 : transformations. */
251 :
252 : static void
253 218692 : adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
254 : {
255 218692 : tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
256 :
257 218692 : gcc_assert (TREE_CODE (orig_def) != SSA_NAME
258 : || orig_def != new_def);
259 :
260 218692 : SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
261 :
262 218692 : if (MAY_HAVE_DEBUG_BIND_STMTS)
263 84046 : adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
264 : gimple_bb (update_phi));
265 218692 : }
266 :
267 : /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
268 : that the control should have during the first iteration and NEXT_CTRL is the
269 : value that it should have on subsequent iterations. */
270 :
271 : static void
272 86 : vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
273 : tree next_ctrl)
274 : {
275 86 : gphi *phi = create_phi_node (ctrl, loop->header);
276 86 : add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
277 86 : add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
278 86 : }
279 :
280 : /* Add SEQ to the end of LOOP's preheader block. */
281 :
282 : static void
283 18 : add_preheader_seq (class loop *loop, gimple_seq seq)
284 : {
285 18 : if (seq)
286 : {
287 15 : edge pe = loop_preheader_edge (loop);
288 15 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
289 15 : gcc_assert (!new_bb);
290 : }
291 18 : }
292 :
293 : /* Add SEQ to the beginning of LOOP's header block. */
294 :
295 : static void
296 0 : add_header_seq (class loop *loop, gimple_seq seq)
297 : {
298 0 : if (seq)
299 : {
300 0 : gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
301 0 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
302 : }
303 0 : }
304 :
/* Return true if the target can interleave elements of two vectors.
   OFFSET is 0 if the first half of the vectors should be interleaved
   or 1 if the second half should.  When returning true, store the
   associated permutation in INDICES.  */

static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
			unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 base = exact_div (nelts, 2) * offset;
  /* Two interleaved series, three elements per pattern; the builder
     extrapolates the remaining indices from these.  */
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (base + i);
      sel.quick_push (base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
			       *indices);
}
326 :
/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
			       rgroup_controls *src_rgm)
{
  tree src_masktype = src_rgm->type;
  tree dest_masktype = dest_rgm->type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  insn_code icode1, icode2;
  /* First alternative: widen each source mask with unpack-hi/lo.  */
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && (icode1 = optab_handler (vec_unpacku_hi_optab,
				  src_mode)) != CODE_FOR_nothing
      && (icode2 = optab_handler (vec_unpacku_lo_optab,
				  src_mode)) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
	 we need.  We can then VIEW_CONVERT any excess bits away.  */
      machine_mode dest_mode = insn_data[icode1].operand[0].mode;
      gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
      for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
	{
	  /* Each source mask feeds two destination masks.  */
	  tree src = src_rgm->controls[i / 2];
	  tree dest = dest_rgm->controls[i];
	  /* Which half to unpack depends on endianness.  */
	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
			    ? VEC_UNPACK_HI_EXPR
			    : VEC_UNPACK_LO_EXPR);
	  gassign *stmt;
	  if (dest_masktype == unpack_masktype)
	    stmt = gimple_build_assign (dest, code, src);
	  else
	    {
	      /* Unpack into a temporary, then view-convert the excess
		 bits away.  */
	      tree temp = make_ssa_name (unpack_masktype);
	      stmt = gimple_build_assign (temp, code, src);
	      gimple_seq_add_stmt (seq, stmt);
	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
					  build1 (VIEW_CONVERT_EXPR,
						  dest_masktype, temp));
	    }
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  /* Second alternative: interleave each source mask with itself.  */
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
	 we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
	{
	  tree src = src_rgm->controls[i / 2];
	  tree dest = dest_rgm->controls[i];
	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
					      src, src, masks[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  return false;
}
395 :
/* Populate DEST_RGM->controls, given that they should add up to STEP.

     STEP = MIN_EXPR <ivtmp_34, VF>;

     First length (MIN (X, VF/N)):
       loop_len_15 = MIN_EXPR <STEP, VF/N>;

     Second length:
       tmp = STEP - loop_len_15;
       loop_len_16 = MIN (tmp, VF/N);

     Third length:
       tmp2 = tmp - loop_len_16;
       loop_len_17 = MIN (tmp2, VF/N);

     Last length:
       loop_len_18 = tmp2 - loop_len_17;
*/

static void
vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
			       rgroup_controls *dest_rgm, tree step)
{
  tree ctrl_type = dest_rgm->type;
  /* Number of scalar items covered by one control.  */
  poly_uint64 nitems_per_ctrl
    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);

  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
    {
      tree ctrl = dest_rgm->controls[i];
      if (i == 0)
	{
	  /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
	  gassign *assign
	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
	  gimple_seq_add_stmt (seq, assign);
	}
      else if (i == dest_rgm->controls.length () - 1)
	{
	  /* Last iteration: Remain capped to the range [0, VF/N].  */
	  gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
						 dest_rgm->controls[i - 1]);
	  gimple_seq_add_stmt (seq, assign);
	}
      else
	{
	  /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
	  step = gimple_build (seq, MINUS_EXPR, iv_type, step,
			       dest_rgm->controls[i - 1]);
	  gassign *assign
	    = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
	  gimple_seq_add_stmt (seq, assign);
	}
    }
}
452 :
453 : /* Stores the standard position for induction variable increment in belonging to
454 : LOOP_EXIT (just before the exit condition of the given exit to BSI.
455 : INSERT_AFTER is set to true if the increment should be inserted after
456 : *BSI. */
457 :
458 : void
459 61880 : vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
460 : bool *insert_after)
461 : {
462 61880 : basic_block bb = loop_exit->src;
463 61880 : *bsi = gsi_last_bb (bb);
464 61880 : *insert_after = false;
465 61880 : }
466 :
/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
   for all the rgroup controls in RGC and return a control that is nonzero
   when the loop needs to iterate.  Add any new preheader statements to
   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.

   RGC belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGC->max_nscalars_per_iter * RGC->factor

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGC->max_nscalars_per_iter * RGC->factor

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks or zero
   lengths for RGC.

   Note: the cost of the code generated by this function is modeled
   by vect_estimate_min_profitable_iters, so changes here may need
   corresponding changes there.  */

static tree
vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
				 gimple_seq *preheader_seq,
				 gimple_seq *header_seq,
				 gimple_stmt_iterator loop_cond_gsi,
				 rgroup_controls *rgc, tree niters,
				 tree niters_skip, bool might_wrap_p,
				 tree *iv_step, tree *compare_step)
{
  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
  bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  tree ctrl_type = rgc->type;
  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree length_limit = NULL_TREE;
  /* For length, we need length_limit to ensure length in range.  */
  if (!use_masks_p)
    length_limit = build_int_cst (compare_type, nitems_per_ctrl);

  /* Calculate the maximum number of item values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nitems_total = niters;
  tree nitems_step = build_int_cst (iv_type, vf);
  tree nitems_skip = niters_skip;
  if (nitems_per_iter != 1)
    {
      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
	 these multiplications don't overflow.  */
      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
      tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				   nitems_total, compare_factor);
      nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
				  nitems_step, iv_factor);
      if (nitems_skip)
	nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				    nitems_skip, compare_factor);
    }

  /* Create an induction variable that counts the number of items
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  edge exit_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
  vect_iv_increment_position (exit_e, &incr_gsi, &insert_after);
  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
    {
      /* Create an IV that counts down from niters_total and whose step
	 is the (variable) amount processed in the current iteration:
	 ...
	 _10 = (unsigned long) count_12(D);
	 ...
	 # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
	 _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
	 ...
	 vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
	 ...
	 ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
	 ...
	 if (ivtmp_9 > POLY_INT_CST [4, 4])
	   goto <bb 4>; [83.33%]
	 else
	   goto <bb 5>; [16.67%]
      */
      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
      /* With a single control the step IS the control; otherwise use a
	 fresh name that vect_adjust_loop_lens_control will split up.  */
      tree step = rgc->controls.length () == 1 ? rgc->controls[0]
					       : make_ssa_name (iv_type);
      /* Create decrement IV.  */
      if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
	{
	  create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
		     insert_after, &index_before_incr, &index_after_incr);
	  tree vectype = build_zero_cst (rgc->type);
	  tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
				   index_before_incr, nitems_step,
				   vectype);
	  gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
	}
      else
	{
	  create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
		     &incr_gsi, insert_after, &index_before_incr,
		     &index_after_incr);
	  gimple_seq_add_stmt (header_seq,
			       gimple_build_assign (step, MIN_EXPR,
						    index_before_incr,
						    nitems_step));
	}
      *iv_step = step;
      *compare_step = nitems_step;
      return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
						       : index_before_incr;
    }

  /* Create increment IV.  */
  create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
	     loop, &incr_gsi, insert_after, &index_before_incr,
	     &index_after_incr);

  tree zero_index = build_int_cst (compare_type, 0);
  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
	 IV reaches a value greater than or equal to:

	   NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP

	 However, there's no guarantee that this addition doesn't overflow
	 the comparison type, or that the IV hits a value above it before
	 wrapping around.  We therefore adjust the limit down by one
	 IV step:

	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
	   -[infinite-prec] NITEMS_STEP

	 and compare the IV against this limit _before_ incrementing it.
	 Since the comparison type is unsigned, we actually want the
	 subtraction to saturate at zero:

	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
	   -[sat] NITEMS_STEP

	 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:

	   NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)

	 where the rightmost subtraction can be done directly in
	 COMPARE_TYPE.  */
      test_index = index_before_incr;
      tree adjust = gimple_convert (preheader_seq, compare_type,
				    nitems_step);
      if (nitems_skip)
	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
			       adjust, nitems_skip);
      /* MAX_EXPR followed by MINUS_EXPR implements the saturating
	 subtraction described above.  */
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
				 nitems_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
				 test_limit, adjust);
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nitems_skip)
	{
	  /* The first vector iteration can handle at most NITEMS_STEP
	     items.  NITEMS_STEP <= CONST_LIMIT, and adding
	     NITEMS_SKIP to that cannot overflow.  */
	  tree const_limit = build_int_cst (compare_type,
					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
					    * nitems_per_iter);
	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
				      nitems_total, const_limit);
	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				      first_limit, nitems_skip);
	}
      else
	/* For the first iteration it doesn't matter whether the IV hits
	   a value above NITEMS_TOTAL.  That only matters for the latch
	   condition.  */
	first_limit = nitems_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
	 the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nitems_total;
      if (nitems_skip)
	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				   test_limit, nitems_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Convert the IV value to the comparison type (either a no-op or
     a demotion).  */
  gimple_seq test_seq = NULL;
  test_index = gimple_convert (&test_seq, compare_type, test_index);
  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);

  /* Provide a definition of each control in the group.  */
  tree next_ctrl = NULL_TREE;
  tree ctrl;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
    {
      /* Previous controls will cover BIAS items.  This control covers the
	 next batch.  */
      poly_uint64 bias = nitems_per_ctrl * i;
      tree bias_tree = build_int_cst (compare_type, bias);

      /* See whether the first iteration of the vector loop is known
	 to have a full control.  */
      poly_uint64 const_limit;
      bool first_iteration_full
	= (poly_int_tree_p (first_limit, &const_limit)
	   && known_ge (const_limit, (i + 1) * nitems_per_ctrl));

      /* Rather than have a new IV that starts at BIAS and goes up to
	 TEST_LIMIT, prefer to use the same 0-based IV for each control
	 and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
	{
	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	}

      /* Create the initial control.  First include all items that
	 are within the loop limit.  */
      tree init_ctrl = NULL_TREE;
      if (!first_iteration_full)
	{
	  tree start, end;
	  if (first_limit == test_limit)
	    {
	      /* Use a natural test between zero (the initial IV value)
		 and the loop limit.  The "else" block would be valid too,
		 but this choice can avoid the need to load BIAS_TREE into
		 a register.  */
	      start = zero_index;
	      end = this_test_limit;
	    }
	  else
	    {
	      /* FIRST_LIMIT is the maximum number of items handled by the
		 first iteration of the vector loop.  Test the portion
		 associated with this control.  */
	      start = bias_tree;
	      end = first_limit;
	    }

	  if (use_masks_p)
	    init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
					start, end, "max_mask");
	  else
	    {
	      init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
	      gimple_seq seq = vect_gen_len (init_ctrl, start,
					     end, length_limit);
	      gimple_seq_add_seq (preheader_seq, seq);
	    }
	}

      /* Now AND out the bits that are within the number of skipped
	 items.  */
      poly_uint64 const_skip;
      if (nitems_skip
	  && !(poly_int_tree_p (nitems_skip, &const_skip)
	       && known_le (const_skip, bias)))
	{
	  gcc_assert (use_masks_p);
	  tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
						    bias_tree, nitems_skip);
	  if (init_ctrl)
	    init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
				      init_ctrl, unskipped_mask);
	  else
	    init_ctrl = unskipped_mask;
	}

      if (!init_ctrl)
	{
	  /* First iteration is full.  */
	  if (use_masks_p)
	    init_ctrl = build_minus_one_cst (ctrl_type);
	  else
	    init_ctrl = length_limit;
	}

      /* Get the control value for the next iteration of the loop.  */
      if (use_masks_p)
	{
	  gimple_seq stmts = NULL;
	  next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
				      this_test_limit, "next_mask");
	  gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
	}
      else
	{
	  next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
	  gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
					 length_limit);
	  gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
	}

      vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
    }

  /* Some targets apply a constant bias to partial-vector lengths;
     emit the biased copy of the first control if so.  */
  int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
  if (partial_load_bias != 0)
    {
      tree adjusted_len = rgc->bias_adjusted_ctrl;
      gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
					    rgc->controls[0],
					    build_int_cst
					      (TREE_TYPE (rgc->controls[0]),
					       partial_load_bias));
      gimple_seq_add_stmt (header_seq, minus);
    }

  return next_ctrl;
}
816 :
817 : /* Set up the iteration condition and rgroup controls for LOOP, given
818 : that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
819 : loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
820 : the number of iterations of the original scalar loop that should be
821 : handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
822 : for vect_set_loop_condition.
823 :
824 : Insert the branch-back condition before LOOP_COND_GSI and return the
825 : final gcond. */
826 :
827 : static gcond *
828 0 : vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
829 : loop_vec_info loop_vinfo, tree niters,
830 : tree final_iv, bool niters_maybe_zero,
831 : gimple_stmt_iterator loop_cond_gsi)
832 : {
833 0 : gimple_seq preheader_seq = NULL;
834 0 : gimple_seq header_seq = NULL;
835 :
836 0 : bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
837 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
838 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
839 0 : tree orig_niters = niters;
840 :
841 : /* Type of the initial value of NITERS. */
842 0 : tree ni_actual_type = TREE_TYPE (niters);
843 0 : unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
844 0 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
845 0 : if (niters_skip)
846 0 : niters_skip = gimple_convert (&preheader_seq, compare_type, niters_skip);
847 :
848 : /* Convert NITERS to the same size as the compare. */
849 0 : if (compare_precision > ni_actual_precision
850 0 : && niters_maybe_zero)
851 : {
852 : /* We know that there is always at least one iteration, so if the
853 : count is zero then it must have wrapped. Cope with this by
854 : subtracting 1 before the conversion and adding 1 to the result. */
855 0 : gcc_assert (TYPE_UNSIGNED (ni_actual_type));
856 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
857 : niters, build_minus_one_cst (ni_actual_type));
858 0 : niters = gimple_convert (&preheader_seq, compare_type, niters);
859 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
860 : niters, build_one_cst (compare_type));
861 : }
862 : else
863 0 : niters = gimple_convert (&preheader_seq, compare_type, niters);
864 :
865 : /* Iterate over all the rgroups and fill in their controls. We could use
866 : the first control from any rgroup for the loop condition; here we
867 : arbitrarily pick the last. */
868 0 : tree test_ctrl = NULL_TREE;
869 0 : tree iv_step = NULL_TREE;
870 0 : tree compare_step = NULL_TREE;
871 0 : rgroup_controls *rgc;
872 0 : rgroup_controls *iv_rgc = nullptr;
873 0 : unsigned int i;
874 0 : auto_vec<rgroup_controls> *controls = use_masks_p
875 0 : ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
876 : : &LOOP_VINFO_LENS (loop_vinfo);
877 0 : FOR_EACH_VEC_ELT (*controls, i, rgc)
878 0 : if (!rgc->controls.is_empty ())
879 : {
880 : /* First try using permutes. This adds a single vector
881 : instruction to the loop for each mask, but needs no extra
882 : loop invariants or IVs. */
883 0 : unsigned int nmasks = i + 1;
884 0 : if (use_masks_p && (nmasks & 1) == 0)
885 : {
886 0 : rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
887 0 : if (!half_rgc->controls.is_empty ()
888 0 : && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
889 0 : continue;
890 : }
891 :
892 0 : if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
893 0 : || !iv_rgc
894 0 : || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
895 0 : != rgc->max_nscalars_per_iter * rgc->factor))
896 : {
897 : /* See whether zero-based IV would ever generate all-false masks
898 : or zero length before wrapping around. */
899 0 : bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
900 :
901 : /* Set up all controls for this group. */
902 0 : test_ctrl
903 0 : = vect_set_loop_controls_directly (loop, loop_vinfo,
904 : &preheader_seq, &header_seq,
905 : loop_cond_gsi, rgc, niters,
906 : niters_skip, might_wrap_p,
907 : &iv_step, &compare_step);
908 :
909 0 : iv_rgc = rgc;
910 : }
911 :
912 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
913 0 : && rgc->controls.length () > 1)
914 : {
915 : /* vect_set_loop_controls_directly creates an IV whose step
916 : is equal to the expected sum of RGC->controls. Use that
917 : information to populate RGC->controls. */
918 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
919 0 : gcc_assert (iv_step);
920 0 : vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
921 : }
922 : }
923 :
924 : /* Emit all accumulated statements. */
925 0 : add_preheader_seq (loop, preheader_seq);
926 0 : add_header_seq (loop, header_seq);
927 :
928 : /* Get a boolean result that tells us whether to iterate. */
929 0 : gcond *cond_stmt;
930 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
931 0 : && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
932 : {
933 0 : gcc_assert (compare_step);
934 0 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
935 0 : cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
936 : NULL_TREE);
937 0 : }
938 : else
939 : {
940 0 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
941 0 : tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
942 0 : cond_stmt
943 0 : = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
944 : }
945 0 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
946 :
947 : /* The loop iterates (NITERS - 1) / VF + 1 times.
948 : Subtract one from this to get the latch count. */
949 0 : tree step = build_int_cst (compare_type,
950 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo));
951 0 : tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
952 : build_minus_one_cst (compare_type));
953 0 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
954 : niters_minus_one, step);
955 :
956 0 : if (final_iv)
957 : {
958 0 : gassign *assign;
959 : /* If vectorizing an inverted early break loop we have to restart the
960 : scalar loop at niters - vf. This matches what we do in
961 : vect_gen_vector_loop_niters_mult_vf for non-masked loops. */
962 0 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
963 : {
964 0 : tree ftype = TREE_TYPE (orig_niters);
965 0 : tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
966 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
967 : }
968 : else
969 0 : assign = gimple_build_assign (final_iv, orig_niters);
970 0 : gsi_insert_on_edge_immediate (exit_edge, assign);
971 : }
972 :
973 0 : return cond_stmt;
974 : }
975 :
976 : /* Set up the iteration condition and rgroup controls for LOOP in AVX512
977 : style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
978 : vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
979 : the number of iterations of the original scalar loop that should be
980 : handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
981 : for vect_set_loop_condition.
982 :
983 : Insert the branch-back condition before LOOP_COND_GSI and return the
984 : final gcond. */
985 :
986 : static gcond *
987 18 : vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
988 : edge exit_edge,
989 : loop_vec_info loop_vinfo, tree niters,
990 : tree final_iv,
991 : bool niters_maybe_zero,
992 : gimple_stmt_iterator loop_cond_gsi)
993 : {
994 18 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
995 18 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
996 18 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
997 18 : tree orig_niters = niters;
998 18 : gimple_seq preheader_seq = NULL;
999 :
1000 : /* Create an IV that counts down from niters and whose step
1001 : is the number of iterations processed in the current iteration.
1002 : Produce the controls with compares like the following.
1003 :
1004 : # iv_2 = PHI <niters, iv_3>
1005 : rem_4 = MIN <iv_2, VF>;
1006 : remv_6 = { rem_4, rem_4, rem_4, ... }
1007 : mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv_6;
1008 : iv_3 = iv_2 - VF;
1009 : if (iv_2 > VF)
1010 : continue;
1011 :
1012 : Where the constant is built with elements at most VF - 1 and
1013 : repetitions according to max_nscalars_per_iter which is guaranteed
1014 : to be the same within a group. */
1015 :
1016 : /* Convert NITERS to the determined IV type. */
1017 18 : if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
1018 18 : && niters_maybe_zero)
1019 : {
1020 : /* We know that there is always at least one iteration, so if the
1021 : count is zero then it must have wrapped. Cope with this by
1022 : subtracting 1 before the conversion and adding 1 to the result. */
1023 0 : gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
1024 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters),
1025 0 : niters, build_minus_one_cst (TREE_TYPE (niters)));
1026 0 : niters = gimple_convert (&preheader_seq, iv_type, niters);
1027 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type,
1028 : niters, build_one_cst (iv_type));
1029 : }
1030 : else
1031 18 : niters = gimple_convert (&preheader_seq, iv_type, niters);
1032 :
1033 : /* Bias the initial value of the IV in case we need to skip iterations
1034 : at the beginning. */
1035 18 : tree niters_adj = niters;
1036 18 : if (niters_skip)
1037 : {
1038 1 : tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip);
1039 1 : niters_adj = gimple_build (&preheader_seq, PLUS_EXPR,
1040 : iv_type, niters, skip);
1041 : }
1042 :
1043 : /* The iteration step is the vectorization factor. */
1044 18 : tree iv_step = build_int_cst (iv_type, vf);
1045 :
1046 : /* Create the decrement IV. */
1047 18 : tree index_before_incr, index_after_incr;
1048 18 : gimple_stmt_iterator incr_gsi;
1049 18 : bool insert_after;
1050 18 : vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1051 18 : create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
1052 : &incr_gsi, insert_after, &index_before_incr,
1053 : &index_after_incr);
1054 :
1055 : /* Iterate over all the rgroups and fill in their controls. */
1056 80 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1057 : {
1058 26 : if (rgc.controls.is_empty ())
1059 6 : continue;
1060 :
1061 20 : tree ctrl_type = rgc.type;
1062 20 : poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
1063 :
 : /* Vector type of the values compared against the remaining
 : iteration count to produce the masks. */
1064 20 : tree vectype = rgc.compare_type;
1065 :
1066 : /* index_after_incr is the IV specifying the remaining iterations in
1067 : the next iteration. */
1068 20 : tree rem = index_after_incr;
1069 : /* When the data type for the compare to produce the mask is
1070 : smaller than the IV type we need to saturate. Saturate to
1071 : the smallest possible value (IV_TYPE) so we only have to
1072 : saturate once (CSE will catch redundant ones we add). */
1073 20 : if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
1074 9 : rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1075 : UNKNOWN_LOCATION,
1076 9 : MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
1077 20 : rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
1078 20 : UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
1079 :
1080 : /* Build a data vector composed of the remaining iterations. */
1081 20 : rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
1082 : UNKNOWN_LOCATION, vectype, rem);
1083 :
1084 : /* Provide a definition of each vector in the control group. */
1085 20 : tree next_ctrl = NULL_TREE;
1086 20 : tree first_rem = NULL_TREE;
1087 20 : tree ctrl;
1088 20 : unsigned int i;
1089 238 : FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
1090 : {
1091 : /* Previous controls will cover BIAS items. This control covers the
1092 : next batch. */
1093 86 : poly_uint64 bias = nitems_per_ctrl * i;
1094 :
1095 : /* Build the constant to compare the remaining iters against,
1096 : this is sth like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately
1097 : split into pieces. */
1098 86 : unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant ();
1099 86 : tree_vector_builder builder (vectype, n, 1);
1100 1266 : for (unsigned i = 0; i < n; ++i)
1101 : {
1102 1180 : unsigned HOST_WIDE_INT val
1103 1180 : = (i + bias.to_constant ()) / rgc.max_nscalars_per_iter;
1104 1180 : gcc_assert (val < vf.to_constant ());
1105 1180 : builder.quick_push (build_int_cst (TREE_TYPE (vectype), val));
1106 : }
1107 86 : tree cmp_series = builder.build ();
1108 :
1109 : /* Create the initial control. First include all items that
1110 : are within the loop limit. */
1111 86 : tree init_ctrl = NULL_TREE;
1112 86 : poly_uint64 const_limit;
1113 : /* See whether the first iteration of the vector loop is known
1114 : to have a full control. */
1115 86 : if (poly_int_tree_p (niters, &const_limit)
1116 86 : && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
1117 1 : init_ctrl = build_minus_one_cst (ctrl_type);
1118 : else
1119 : {
1120 : /* The remaining work items initially are niters. Saturate,
1121 : splat and compare. */
1122 85 : if (!first_rem)
1123 : {
1124 19 : first_rem = niters;
1125 19 : if (TYPE_PRECISION (TREE_TYPE (vectype))
1126 19 : < TYPE_PRECISION (iv_type))
1127 9 : first_rem = gimple_build (&preheader_seq,
1128 9 : MIN_EXPR, TREE_TYPE (first_rem),
1129 : first_rem, iv_step);
1130 19 : first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype),
1131 : first_rem);
1132 19 : first_rem = gimple_build_vector_from_val (&preheader_seq,
1133 : vectype, first_rem);
1134 : }
1135 85 : init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type,
1136 : cmp_series, first_rem);
1137 : }
1138 :
1139 : /* Now AND out the bits that are within the number of skipped
1140 : items. */
1141 86 : poly_uint64 const_skip;
1142 86 : if (niters_skip
1143 86 : && !(poly_int_tree_p (niters_skip, &const_skip)
1144 1 : && known_le (const_skip, bias)))
1145 : {
1146 : /* For integer mode masks it's cheaper to shift out the bits
1147 : since that avoids loading a constant. */
1148 1 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
1149 1 : init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1150 1 : lang_hooks.types.type_for_mode
1151 1 : (TYPE_MODE (ctrl_type), 1),
1152 : init_ctrl);
1153 : /* ??? But when the shift amount isn't constant this requires
1154 : a round-trip to GPRs. We could apply the bias to either
1155 : side of the compare instead. */
1156 2 : tree shift = gimple_build (&preheader_seq, MINUS_EXPR,
1157 1 : TREE_TYPE (niters_skip), niters_skip,
1158 1 : build_int_cst (TREE_TYPE (niters_skip),
1159 1 : bias));
1160 2 : shift = gimple_build (&preheader_seq, MULT_EXPR,
1161 1 : TREE_TYPE (niters_skip), shift,
1162 1 : build_int_cst (TREE_TYPE (niters_skip),
1163 1 : rgc.max_nscalars_per_iter));
1164 1 : init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR,
1165 1 : TREE_TYPE (init_ctrl),
1166 : init_ctrl, shift);
1167 1 : init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1168 : ctrl_type, init_ctrl);
1169 : }
1170 :
1171 : /* Get the control value for the next iteration of the loop. */
1172 86 : next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1173 : UNKNOWN_LOCATION,
1174 : LT_EXPR, ctrl_type, cmp_series, rem);
1175 :
1176 86 : vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
1177 86 : }
1178 : }
1179 :
1180 : /* Emit all accumulated statements. */
1181 18 : add_preheader_seq (loop, preheader_seq);
1182 :
1183 : /* Adjust the exit test using the decrementing IV. */
1184 18 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
1185 : /* When we peel for alignment with niter_skip != 0 this can
1186 : cause niter + niter_skip to wrap and since we are comparing the
1187 : value before the decrement here we get a false early exit.
1188 : We can't compare the value after decrement either because that
1189 : decrement could wrap as well as we're not doing a saturating
1190 : decrement. To avoid this situation we force a larger
1191 : iv_type. */
1192 18 : gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
1193 : NULL_TREE, NULL_TREE);
1194 18 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1195 :
1196 : /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
1197 : Subtract one from this to get the latch count. */
1198 18 : tree niters_minus_one
1199 18 : = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
1200 : build_minus_one_cst (TREE_TYPE (orig_niters)));
1201 18 : tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
1202 18 : if (niters_skip)
1203 1 : niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one,
1204 : fold_convert (iv_type, niters_skip));
1205 18 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
1206 : niters_adj2, iv_step);
1207 :
1208 18 : if (final_iv)
1209 : {
1210 0 : gassign *assign;
1211 : /* If vectorizing an inverted early break loop we have to restart the
1212 : scalar loop at niters - vf. This matches what we do in
1213 : vect_gen_vector_loop_niters_mult_vf for non-masked loops. */
1214 0 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
1215 : {
1216 0 : tree ftype = TREE_TYPE (orig_niters);
1217 0 : tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1218 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
1219 : }
1220 : else
1221 0 : assign = gimple_build_assign (final_iv, orig_niters);
1222 0 : gsi_insert_on_edge_immediate (exit_edge, assign);
1223 : }
1224 :
1225 18 : return cond_stmt;
1226 : }
1227 :
1228 :
1229 : /* Like vect_set_loop_condition, but handle the case in which the vector
1230 : loop handles exactly VF scalars per iteration. */
1231 :
1232 : static gcond *
1233 61795 : vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
1234 : class loop *loop, tree niters, tree step,
1235 : tree final_iv, bool niters_maybe_zero,
1236 : gimple_stmt_iterator loop_cond_gsi)
1237 : {
1238 61795 : tree indx_before_incr, indx_after_incr;
1239 61795 : gcond *cond_stmt;
1240 61795 : gcond *orig_cond;
1241 61795 : edge pe = loop_preheader_edge (loop);
1242 61795 : gimple_stmt_iterator incr_gsi;
1243 61795 : bool insert_after;
1244 61795 : enum tree_code code;
1245 61795 : tree niters_type = TREE_TYPE (niters);
1246 :
1247 61795 : orig_cond = get_loop_exit_condition (exit_edge);
1248 61795 : gcc_assert (orig_cond);
1249 61795 : loop_cond_gsi = gsi_for_stmt (orig_cond);
1250 :
1251 61795 : tree init, limit;
1252 61795 : if (!niters_maybe_zero && integer_onep (step))
1253 : {
1254 : /* In this case we can use a simple 0-based IV:
1255 :
1256 : A:
1257 : x = 0;
1258 : do
1259 : {
1260 : ...
1261 : x += 1;
1262 : }
1263 : while (x < NITERS); */
1264 61795 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1265 61795 : init = build_zero_cst (niters_type);
1266 61795 : limit = niters;
1267 : }
1268 : else
1269 : {
1270 : /* The following works for all values of NITERS except 0:
1271 :
1272 : B:
1273 : x = 0;
1274 : do
1275 : {
1276 : ...
1277 : x += STEP;
1278 : }
1279 : while (x <= NITERS - STEP);
1280 :
1281 : so that the loop continues to iterate if x + STEP - 1 < NITERS
1282 : but stops if x + STEP - 1 >= NITERS.
1283 :
1284 : However, if NITERS is zero, x never hits a value above NITERS - STEP
1285 : before wrapping around. There are two obvious ways of dealing with
1286 : this:
1287 :
1288 : - start at STEP - 1 and compare x before incrementing it
1289 : - start at -1 and compare x after incrementing it
1290 :
1291 : The latter is simpler and is what we use. The loop in this case
1292 : looks like:
1293 :
1294 : C:
1295 : x = -1;
1296 : do
1297 : {
1298 : ...
1299 : x += STEP;
1300 : }
1301 : while (x < NITERS - STEP);
1302 :
1303 : In both cases the loop limit is NITERS - STEP. */
1304 0 : gimple_seq seq = NULL;
1305 0 : limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1306 0 : limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
1307 0 : if (seq)
1308 : {
1309 0 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1310 0 : gcc_assert (!new_bb);
1311 : }
1312 0 : if (niters_maybe_zero)
1313 : {
1314 : /* Case C. */
1315 0 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1316 0 : init = build_all_ones_cst (niters_type);
1317 : }
1318 : else
1319 : {
1320 : /* Case B. */
1321 0 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1322 0 : init = build_zero_cst (niters_type);
1323 : }
1324 : }
1325 :
 : /* Create the IV x above and rewrite the exit condition to test the
 : incremented value against LIMIT. */
1326 61795 : vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1327 61795 : create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1328 : &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1329 61795 : indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1330 : true, NULL_TREE, true,
1331 : GSI_SAME_STMT);
1332 61795 : limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1333 : true, GSI_SAME_STMT);
1334 :
1335 61795 : cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1336 : NULL_TREE);
1337 :
1338 61795 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1339 :
1340 : /* Record the number of latch iterations. */
1341 61795 : if (limit == niters)
1342 : /* Case A: the loop iterates NITERS times. Subtract one to get the
1343 : latch count. */
1344 61795 : loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1345 : build_int_cst (niters_type, 1));
1346 : else
1347 : /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1348 : Subtract one from this to get the latch count. */
1349 0 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1350 : limit, step);
1351 :
 : /* Set FINAL_IV to the IV value on exit, i.e. N * STEP (see the
 : function comment), eliding the subtraction of INIT when INIT
 : is zero. */
1352 61795 : if (final_iv)
1353 : {
1354 0 : gassign *assign;
1355 0 : gcc_assert (single_pred_p (exit_edge->dest));
1356 0 : tree phi_dest
1357 0 : = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
1358 : /* Make sure to maintain LC SSA form here and elide the subtraction
1359 : if the value is zero. */
1360 0 : gphi *phi = create_phi_node (phi_dest, exit_edge->dest);
1361 0 : add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION);
1362 0 : if (!integer_zerop (init))
1363 : {
1364 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR,
1365 : phi_dest, init);
1366 0 : gimple_stmt_iterator gsi = gsi_after_labels (exit_edge->dest);
1367 0 : gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1368 : }
1369 : }
1370 :
1371 61795 : return cond_stmt;
1372 : }
1373 :
1374 : /* If we're using fully-masked loops, make LOOP iterate:
1375 :
1376 : N == (NITERS - 1) / STEP + 1
1377 :
1378 : times. When NITERS is zero, this is equivalent to making the loop
1379 : execute (1 << M) / STEP times, where M is the precision of NITERS.
1380 : NITERS_MAYBE_ZERO is true if this last case might occur.
1381 :
1382 : If we're not using fully-masked loops, make LOOP iterate:
1383 :
1384 : N == (NITERS - STEP) / STEP + 1
1385 :
1386 : times, where NITERS is known to be outside the range [1, STEP - 1].
1387 : This is equivalent to making the loop execute NITERS / STEP times
1388 : when NITERS is nonzero and (1 << M) / STEP times otherwise.
1389 : NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1390 :
1391 : If FINAL_IV is nonnull, it is an SSA name that should be set to
1392 : N * STEP on exit from the loop.
1393 :
1394 : Assumption: the exit-condition of LOOP is the last stmt in the loop. */
1395 :
1396 : void
1397 61813 : vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo,
1398 : tree niters, tree step, tree final_iv,
1399 : bool niters_maybe_zero)
1400 : {
1401 61813 : gcond *cond_stmt;
1402 61813 : gcond *orig_cond = get_loop_exit_condition (loop_e);
1403 61813 : gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1404 :
 : /* Dispatch on the chosen partial-vectors style: with partial vectors
 : the exit condition is driven by the rgroup controls, otherwise a
 : simple counting IV is used. */
1405 61813 : if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1406 : {
1407 18 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1408 18 : cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_e,
1409 : loop_vinfo,
1410 : niters, final_iv,
1411 : niters_maybe_zero,
1412 : loop_cond_gsi);
1413 : else
1414 0 : cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_e,
1415 : loop_vinfo,
1416 : niters, final_iv,
1417 : niters_maybe_zero,
1418 : loop_cond_gsi);
1419 : }
1420 : else
1421 61795 : cond_stmt = vect_set_loop_condition_normal (loop_vinfo, loop_e, loop,
1422 : niters,
1423 : step, final_iv,
1424 : niters_maybe_zero,
1425 : loop_cond_gsi);
1426 :
1427 : /* Remove old loop exit test. */
1428 61813 : stmt_vec_info orig_cond_info;
1429 61813 : if (loop_vinfo
1430 61813 : && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1431 61375 : loop_vinfo->remove_stmt (orig_cond_info);
1432 : else
1433 438 : gsi_remove (&loop_cond_gsi, true);
1434 :
1435 61813 : if (dump_enabled_p ())
1436 11177 : dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1437 : (gimple *) cond_stmt);
1438 61813 : }
1439 :
1440 : /* Get the virtual operand live on E. The precondition on this is valid
1441 : immediate dominators and an actual virtual definition dominating E. */
1442 : /* ??? Costly band-aid. For the use in question we can populate a
1443 : live-on-exit/end-of-BB virtual operand when copying stmts. */
1444 :
1445 : static tree
1446 10 : get_live_virtual_operand_on_edge (edge e)
1447 : {
1448 10 : basic_block bb = e->src;
1449 22 : do
1450 : {
1451 61 : for (auto gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1452 : {
1453 37 : gimple *stmt = gsi_stmt (gsi);
1454 55 : if (gimple_vdef (stmt))
1455 10 : return gimple_vdef (stmt);
1456 39 : if (gimple_vuse (stmt))
1457 : return gimple_vuse (stmt);
1458 : }
1459 8 : if (gphi *vphi = get_virtual_phi (bb))
1460 2 : return gimple_phi_result (vphi);
1461 6 : bb = get_immediate_dominator (CDI_DOMINATORS, bb);
1462 6 : }
1463 : while (1);
1464 : }
1465 :
1466 : /* Given LOOP this function generates a new copy of it and puts it
1467 : on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1468 : non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1469 : basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1470 : entry or exit of LOOP. If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
1471 : continuation. This is correct for cases where one loop continues from the
1472 : other like in the vectorizer, but not true for uses in e.g. loop distribution
1473 : where the contents of the loop body are split but the iteration space of both
1474 : copies remains the same.
1475 :
1476 : If UPDATED_DOMS is not NULL it is updated with the list of basic blocks whose
1477 : dominators were updated during the peeling. When doing early break vectorization
1478 : then LOOP_VINFO needs to be provided and is used to keep track of any newly created
1479 : memory references that need to be updated should we decide to vectorize. */
1480 :
1481 : class loop *
1482 34915 : slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
1483 : class loop *scalar_loop,
1484 : edge scalar_exit, edge e, edge *new_e,
1485 : bool flow_loops,
1486 : vec<basic_block> *updated_doms,
1487 : bool uncounted_p, bool create_main_e)
1488 : {
1489 34915 : class loop *new_loop;
1490 34915 : basic_block *new_bbs, *bbs, *pbbs;
1491 34915 : bool at_exit;
1492 34915 : bool was_imm_dom;
1493 34915 : basic_block exit_dest;
1494 34915 : edge exit, new_exit;
1495 34915 : bool duplicate_outer_loop = false;
1496 :
1497 34915 : exit = loop_exit;
1498 34915 : at_exit = (e == exit);
1499 34915 : if (!at_exit && e != loop_preheader_edge (loop))
1500 : return NULL;
1501 :
1502 34915 : if (scalar_loop == NULL)
1503 : {
1504 32394 : scalar_loop = loop;
1505 32394 : scalar_exit = loop_exit;
1506 : }
1507 2521 : else if (scalar_loop == loop)
1508 0 : scalar_exit = loop_exit;
1509 : else
1510 : {
1511 : /* Loop has been version, match exits up using the aux index. */
1512 7563 : for (edge exit : get_loop_exit_edges (scalar_loop))
1513 2521 : if (exit->aux == loop_exit->aux)
1514 : {
1515 2521 : scalar_exit = exit;
1516 2521 : break;
1517 2521 : }
1518 :
1519 2521 : gcc_assert (scalar_exit);
1520 : }
1521 :
1522 34915 : bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1523 34915 : pbbs = bbs + 1;
1524 34915 : get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1525 : /* Allow duplication of outer loops. */
1526 34915 : if (scalar_loop->inner)
1527 124 : duplicate_outer_loop = true;
1528 :
1529 : /* Generate new loop structure. */
1530 34915 : new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1531 34915 : duplicate_subloops (scalar_loop, new_loop);
1532 :
1533 34915 : exit_dest = exit->dest;
1534 34915 : was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1535 34915 : exit_dest) == exit->src ?
1536 : true : false);
1537 :
1538 : /* Also copy the pre-header, this avoids jumping through hoops to
1539 : duplicate the loop entry PHI arguments. Create an empty
1540 : pre-header unconditionally for this. */
1541 34915 : basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1542 34915 : edge entry_e = single_pred_edge (preheader);
1543 34915 : bbs[0] = preheader;
1544 34915 : new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1545 :
1546 34915 : copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1547 : &scalar_exit, 1, &new_exit, NULL,
1548 : at_exit ? loop->latch : e->src, true);
1549 34915 : exit = loop_exit;
1550 34915 : basic_block new_preheader = new_bbs[0];
1551 :
1552 34915 : gcc_assert (new_exit);
1553 :
1554 : /* Record the new loop exit information. new_loop doesn't have SCEV data and
1555 : so we must initialize the exit information. */
1556 34915 : if (new_e)
1557 33780 : *new_e = new_exit;
1558 :
1559 : /* Before installing PHI arguments make sure that the edges
1560 : into them match that of the scalar loop we analyzed. This
1561 : makes sure the SLP tree matches up between the main vectorized
1562 : loop and the epilogue vectorized copies. */
1563 34915 : if (single_succ_edge (preheader)->dest_idx
1564 34915 : != single_succ_edge (new_bbs[0])->dest_idx)
1565 : {
1566 29916 : basic_block swap_bb = new_bbs[1];
1567 29916 : gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1568 29916 : std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1569 29916 : EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1570 29916 : EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1571 : }
1572 34915 : if (duplicate_outer_loop)
1573 : {
1574 124 : class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1575 124 : if (loop_preheader_edge (scalar_loop)->dest_idx
1576 124 : != loop_preheader_edge (new_inner_loop)->dest_idx)
1577 : {
1578 99 : basic_block swap_bb = new_inner_loop->header;
1579 99 : gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1580 99 : std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1581 99 : EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1582 99 : EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1583 : }
1584 : }
1585 :
1586 34915 : add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1587 :
1588 : /* Skip new preheader since it's deleted if copy loop is added at entry. */
1589 182320 : for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1590 112490 : rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1591 :
1592 : /* Rename the exit uses. */
1593 141431 : for (edge exit : get_loop_exit_edges (new_loop))
1594 36686 : for (auto gsi = gsi_start_phis (exit->dest);
1595 85318 : !gsi_end_p (gsi); gsi_next (&gsi))
1596 : {
1597 48632 : tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
1598 48632 : rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
1599 48632 : if (MAY_HAVE_DEBUG_BIND_STMTS)
1600 23200 : adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
1601 34915 : }
1602 :
1603 34915 : auto loop_exits = get_loop_exit_edges (loop);
1604 34915 : bool multiple_exits_p = loop_exits.length () > 1;
1605 34915 : auto_vec<basic_block> doms;
1606 :
1607 34915 : if (at_exit) /* Add the loop copy at exit. */
1608 : {
1609 33342 : if (scalar_loop != loop && new_exit->dest != exit_dest)
1610 : {
1611 2517 : new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1612 2517 : flush_pending_stmts (new_exit);
1613 : }
1614 :
1615 33342 : bool need_virtual_phi = get_virtual_phi (loop->header);
1616 :
1617 : /* For the main loop exit preserve the LC PHI nodes. For vectorization
1618 : we need them to continue or finalize reductions. Since we do not
1619 : copy the loop exit blocks we have to materialize PHIs at the
1620 : new destination before redirecting edges. */
1621 33342 : for (auto gsi_from = gsi_start_phis (loop_exit->dest);
1622 78962 : !gsi_end_p (gsi_from); gsi_next (&gsi_from))
1623 : {
1624 45620 : tree res = gimple_phi_result (*gsi_from);
1625 45620 : create_phi_node (copy_ssa_name (res), new_preheader);
1626 : }
1627 33342 : edge e = redirect_edge_and_branch (loop_exit, new_preheader);
1628 33342 : gcc_assert (e == loop_exit);
1629 33342 : flush_pending_stmts (loop_exit);
1630 33342 : set_immediate_dominator (CDI_DOMINATORS, new_preheader, loop_exit->src);
1631 :
1632 : /* If we ended up choosing an exit leading to a path not using memory
1633 : we can end up without a virtual LC PHI. Create it when it is
1634 : needed because of the epilog loop continuation. */
1635 33342 : if (need_virtual_phi && !get_virtual_phi (loop_exit->dest))
1636 : {
1637 8 : tree header_def = gimple_phi_result (get_virtual_phi (loop->header));
1638 8 : gphi *vphi = create_phi_node (copy_ssa_name (header_def),
1639 : new_preheader);
1640 8 : add_phi_arg (vphi, get_live_virtual_operand_on_edge (loop_exit),
1641 : loop_exit, UNKNOWN_LOCATION);
1642 : }
1643 :
1644 33342 : bool multiple_exits_p = loop_exits.length () > 1;
1645 33342 : basic_block main_loop_exit_block = new_preheader;
1646 33342 : basic_block alt_loop_exit_block = NULL;
1647 : /* Create the CFG for multiple exits.
1648 : | loop_exit | alt1 | altN
1649 : v v ... v
1650 : main_loop_exit_block: alt_loop_exit_block:
1651 : | /
1652 : v v
1653 : new_preheader:
1654 : where in the new preheader we need merge PHIs for
1655 : the continuation values into the epilogue header.
1656 : Do not bother with exit PHIs for the early exits but
1657 : their live virtual operand. We'll fix up things below. */
1658 33342 : if (multiple_exits_p || uncounted_p)
1659 : {
1660 1406 : edge loop_e = single_succ_edge (new_preheader);
1661 1406 : new_preheader = split_edge (loop_e);
1662 :
1663 1406 : gphi *vphi = NULL;
1664 1406 : alt_loop_exit_block = new_preheader;
1665 7170 : for (auto exit : loop_exits)
1666 2952 : if (exit != loop_exit)
1667 : {
1668 1546 : tree vphi_def = NULL_TREE;
1669 1546 : if (gphi *evphi = get_virtual_phi (exit->dest))
1670 602 : vphi_def = gimple_phi_arg_def_from_edge (evphi, exit);
1671 1546 : edge res = redirect_edge_and_branch (exit, alt_loop_exit_block);
1672 1546 : gcc_assert (res == exit);
1673 1546 : redirect_edge_var_map_clear (exit);
1674 1546 : if (alt_loop_exit_block == new_preheader)
1675 1377 : alt_loop_exit_block = split_edge (exit);
1676 1546 : if (!need_virtual_phi)
1677 1102 : continue;
1678 : /* When the edge has no virtual LC PHI get at the live
1679 : virtual operand by other means. */
1680 444 : if (!vphi_def)
1681 2 : vphi_def = get_live_virtual_operand_on_edge (exit);
1682 444 : if (!vphi)
1683 410 : vphi = create_phi_node (copy_ssa_name (vphi_def),
1684 : alt_loop_exit_block);
1685 : else
1686 : /* Edge redirection might re-allocate the PHI node
1687 : so we have to rediscover it. */
1688 34 : vphi = get_virtual_phi (alt_loop_exit_block);
1689 444 : add_phi_arg (vphi, vphi_def, exit, UNKNOWN_LOCATION);
1690 : }
1691 :
1692 1406 : set_immediate_dominator (CDI_DOMINATORS, new_preheader,
1693 : loop->header);
1694 :
1695 : /* Fix up the profile counts of the new exit blocks.
1696 : main_loop_exit_block was created by duplicating the
1697 : preheader, so needs its count scaling according to the main
1698 : exit edge's probability. The remaining count from the
1699 : preheader goes to the alt_loop_exit_block, since all
1700 : alternative exits have been redirected there. */
1701 1406 : main_loop_exit_block->count = loop_exit->count ();
1702 1406 : alt_loop_exit_block->count
1703 1406 : = preheader->count - main_loop_exit_block->count;
1704 : }
1705 :
1706 : /* Adjust the epilog loop PHI entry values to continue iteration.
1707 : This adds remaining necessary LC PHI nodes to the main exit
1708 : and creates merge PHIs when we have multiple exits with
1709 : their appropriate continuation. */
1710 33342 : if (flow_loops)
1711 : {
1712 33342 : edge loop_entry = single_succ_edge (new_preheader);
1713 33342 : bool peeled_iters = (uncounted_p
1714 33342 : || single_pred (loop->latch) != loop_exit->src);
1715 :
1716 : /* Record the new SSA names in the cache so that we can skip
1717 : materializing them again when we fill in the rest of the LC SSA
1718 : variables. */
1719 33342 : hash_map <tree, tree> new_phi_args;
1720 33342 : for (auto psi = gsi_start_phis (main_loop_exit_block);
1721 78970 : !gsi_end_p (psi); gsi_next (&psi))
1722 : {
1723 45628 : gphi *phi = *psi;
1724 45628 : tree new_arg = gimple_phi_arg_def_from_edge (phi, loop_exit);
1725 45628 : if (TREE_CODE (new_arg) != SSA_NAME)
1726 384 : continue;
1727 :
1728 : /* If the loop doesn't have a virtual def then only possibly keep
1729 : the epilog LC PHI for it and avoid creating new defs. */
1730 45338 : if (virtual_operand_p (new_arg) && !need_virtual_phi)
1731 : {
1732 94 : auto gsi = gsi_for_stmt (phi);
1733 94 : remove_phi_node (&gsi, true);
1734 94 : continue;
1735 94 : }
1736 :
1737 : /* If we decided not to remove the PHI node we should also not
1738 : rematerialize it later on. */
1739 45244 : new_phi_args.put (new_arg, gimple_phi_result (phi));
1740 : }
1741 :
1742 : /* Create the merge PHI nodes in new_preheader and populate the
1743 : arguments for the exits. */
1744 33342 : if (multiple_exits_p || uncounted_p)
1745 : {
1746 1406 : for (auto gsi_from = gsi_start_phis (loop->header),
1747 1406 : gsi_to = gsi_start_phis (new_loop->header);
1748 4454 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1749 3048 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1750 : {
1751 3048 : gimple *from_phi = gsi_stmt (gsi_from);
1752 3048 : gimple *to_phi = gsi_stmt (gsi_to);
1753 :
1754 : /* When the vector loop is peeled then we need to use the
1755 : value at start of the loop, otherwise the main loop exit
1756 : should use the final iter value. */
1757 3048 : tree new_arg;
1758 3048 : if (peeled_iters)
1759 123 : new_arg = gimple_phi_result (from_phi);
1760 : else
1761 2925 : new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1762 : loop_latch_edge (loop));
1763 :
1764 : /* Check if we've already created a new phi node during edge
1765 : redirection and re-use it if so. Otherwise create a
1766 : LC PHI node to feed the merge PHI. */
1767 3048 : tree *res;
1768 6096 : if (virtual_operand_p (new_arg))
1769 : {
1770 : /* Use the existing virtual LC SSA from exit block. */
1771 422 : gphi *vphi = get_virtual_phi (main_loop_exit_block);
1772 422 : new_arg = gimple_phi_result (vphi);
1773 : }
1774 2626 : else if ((res = new_phi_args.get (new_arg)))
1775 105 : new_arg = *res;
1776 : else
1777 : {
1778 : /* Create the LC PHI node for the exit. */
1779 2521 : tree new_def = copy_ssa_name (new_arg);
1780 2521 : gphi *lc_phi
1781 2521 : = create_phi_node (new_def, main_loop_exit_block);
1782 2521 : SET_PHI_ARG_DEF (lc_phi, 0, new_arg);
1783 2521 : new_arg = new_def;
1784 : }
1785 :
1786 : /* Create the PHI node in the merge block merging the
1787 : main and early exit values. */
1788 3048 : tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1789 3048 : gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1790 3048 : edge main_e = single_succ_edge (main_loop_exit_block);
1791 3048 : SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, main_e, new_arg);
1792 :
1793 : /* And adjust the epilog entry value. */
1794 3048 : adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1795 : }
1796 : }
1797 :
1798 33342 : if (multiple_exits_p)
1799 : {
1800 : /* After creating the merge PHIs handle the early exits those
1801 : should use the values at the start of the loop. */
1802 1377 : for (auto gsi_from = gsi_start_phis (loop->header),
1803 1377 : gsi_to = gsi_start_phis (new_preheader);
1804 4373 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1805 2996 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1806 : {
1807 2996 : gimple *from_phi = gsi_stmt (gsi_from);
1808 2996 : gimple *to_phi = gsi_stmt (gsi_to);
1809 :
1810 : /* Now update the virtual PHI nodes with the right value. */
1811 2996 : tree alt_arg = gimple_phi_result (from_phi);
1812 5992 : if (virtual_operand_p (alt_arg))
1813 : {
1814 410 : gphi *vphi = get_virtual_phi (alt_loop_exit_block);
1815 410 : alt_arg = gimple_phi_result (vphi);
1816 : }
1817 : /* For other live args we didn't create LC PHI nodes.
1818 : Do so here. */
1819 : else
1820 : {
1821 2586 : tree alt_def = copy_ssa_name (alt_arg);
1822 2586 : gphi *lc_phi
1823 2586 : = create_phi_node (alt_def, alt_loop_exit_block);
1824 5509 : for (unsigned i = 0; i < gimple_phi_num_args (lc_phi);
1825 : ++i)
1826 2923 : SET_PHI_ARG_DEF (lc_phi, i, alt_arg);
1827 : alt_arg = alt_def;
1828 : }
1829 2996 : edge alt_e = single_succ_edge (alt_loop_exit_block);
1830 2996 : SET_PHI_ARG_DEF_ON_EDGE (to_phi, alt_e, alt_arg);
1831 : }
1832 : }
1833 : /* For the single exit case only create the missing LC PHI nodes
1834 : for the continuation of the loop IVs that are not also already
1835 : reductions and thus had LC PHI nodes on the exit already. */
1836 33342 : if (!multiple_exits_p && !uncounted_p)
1837 : {
1838 31936 : for (auto gsi_from = gsi_start_phis (loop->header),
1839 31936 : gsi_to = gsi_start_phis (new_loop->header);
1840 122211 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1841 90275 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1842 : {
1843 90275 : gimple *from_phi = gsi_stmt (gsi_from);
1844 90275 : gimple *to_phi = gsi_stmt (gsi_to);
1845 90275 : tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1846 : loop_latch_edge (loop));
1847 :
1848 : /* Check if we've already created a new phi node during edge
1849 : redirection. If we have, only propagate the value
1850 : downwards. */
1851 90275 : if (tree *res = new_phi_args.get (new_arg))
1852 : {
1853 42522 : adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
1854 42522 : continue;
1855 : }
1856 :
1857 47753 : tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1858 47753 : gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1859 47753 : SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, loop_exit, new_arg);
1860 47753 : adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1861 : }
1862 : }
1863 33342 : }
1864 :
1865 33342 : if (was_imm_dom || duplicate_outer_loop)
1866 32992 : set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1867 :
1868 : /* And remove the non-necessary forwarder again. Keep the other
1869 : one so we have a proper pre-header for the loop at the exit edge. */
1870 33342 : redirect_edge_pred (single_succ_edge (preheader),
1871 : single_pred (preheader));
1872 33342 : delete_basic_block (preheader);
1873 33342 : set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1874 33342 : loop_preheader_edge (scalar_loop)->src);
1875 :
1876 : /* Finally after wiring the new epilogue we need to update its main exit
1877 : to the original function exit we recorded. Other exits are already
1878 : correct. */
1879 33342 : if (multiple_exits_p || uncounted_p)
1880 : {
1881 1406 : class loop *update_loop = new_loop;
1882 1406 : doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
1883 73580 : for (unsigned i = 0; i < doms.length (); ++i)
1884 72174 : if (flow_bb_inside_loop_p (loop, doms[i]))
1885 4341 : doms.unordered_remove (i);
1886 :
1887 7170 : for (edge e : get_loop_exit_edges (update_loop))
1888 : {
1889 2952 : edge ex;
1890 2952 : edge_iterator ei;
1891 5870 : FOR_EACH_EDGE (ex, ei, e->dest->succs)
1892 : {
1893 : /* Find the first non-fallthrough block as fall-throughs can't
1894 : dominate other blocks. */
1895 2918 : if (single_succ_p (ex->dest))
1896 : {
1897 1414 : doms.safe_push (ex->dest);
1898 1414 : ex = single_succ_edge (ex->dest);
1899 : }
1900 2918 : doms.safe_push (ex->dest);
1901 : }
1902 2952 : doms.safe_push (e->dest);
1903 1406 : }
1904 :
1905 1406 : iterate_fix_dominators (CDI_DOMINATORS, doms, false);
1906 1406 : if (updated_doms)
1907 1406 : updated_doms->safe_splice (doms);
1908 : }
1909 : }
1910 : else /* Add the copy at entry. */
1911 : {
1912 : /* Copy the current loop LC PHI nodes between the original loop exit
1913 : block and the new loop header. This allows us to later split the
1914 : preheader block and still find the right LC nodes. */
1915 1573 : if (flow_loops)
1916 438 : for (auto gsi_from = gsi_start_phis (new_loop->header),
1917 438 : gsi_to = gsi_start_phis (loop->header);
1918 1361 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1919 923 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1920 : {
1921 923 : gimple *from_phi = gsi_stmt (gsi_from);
1922 923 : gimple *to_phi = gsi_stmt (gsi_to);
1923 923 : tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1924 : loop_latch_edge (new_loop));
1925 923 : adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
1926 : new_arg);
1927 : }
1928 :
1929 1573 : if (scalar_loop != loop)
1930 : {
1931 : /* Remove the non-necessary forwarder of scalar_loop again. */
1932 4 : redirect_edge_pred (single_succ_edge (preheader),
1933 : single_pred (preheader));
1934 4 : delete_basic_block (preheader);
1935 4 : set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1936 4 : loop_preheader_edge (scalar_loop)->src);
1937 4 : preheader = split_edge (loop_preheader_edge (loop));
1938 4 : entry_e = single_pred_edge (preheader);
1939 : }
1940 :
1941 1573 : redirect_edge_and_branch_force (entry_e, new_preheader);
1942 1573 : flush_pending_stmts (entry_e);
1943 1573 : set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1944 :
1945 :
1946 : /* `vect_set_loop_condition' replaces the condition in the main exit of
1947 : loop. For counted loops, this is the IV counting exit, so in the case
1948 : of the prolog loop, we are replacing the old IV counting exit limit of
1949 : total loop niters for the new limit of the prolog niters, as desired.
1950 : For uncounted loops, we don't have an IV-counting exit to replace, so
1951 : we add a dummy exit to be consumed by `vect_set_loop_condition' later
1952 : on. */
1953 1573 : if (create_main_e)
1954 : {
1955 31 : edge to_latch_e = single_pred_edge (new_loop->latch);
1956 31 : bool latch_is_false = to_latch_e->flags & EDGE_FALSE_VALUE ? true
1957 : : false;
1958 :
1959 : /* Add new bb for duplicate exit. */
1960 31 : basic_block bbcond = split_edge (to_latch_e);
1961 31 : gimple_stmt_iterator a = gsi_last_bb (bbcond);
1962 :
1963 : /* Fix flags for the edge leading to the latch. */
1964 31 : to_latch_e = find_edge (bbcond, new_loop->latch);
1965 31 : to_latch_e->flags &= ~EDGE_FALLTHRU;
1966 31 : to_latch_e->flags |= latch_is_false ? EDGE_FALSE_VALUE
1967 : : EDGE_TRUE_VALUE;
1968 :
1969 : /* Build the condition. */
1970 31 : tree cone = build_int_cst (sizetype, 1);
1971 31 : tree czero = build_int_cst (sizetype, 0);
1972 31 : gcond *cond_copy = gimple_build_cond (NE_EXPR, cone, czero, NULL_TREE,
1973 : NULL_TREE);
1974 :
1975 31 : gsi_insert_after (&a, cond_copy, GSI_NEW_STMT);
1976 :
1977 : /* Add edge for exiting the loop via new condition. */
1978 38 : edge dup_exit = make_edge (bbcond, new_exit->dest, latch_is_false
1979 : ? EDGE_TRUE_VALUE : EDGE_FALSE_VALUE);
1980 :
1981 31 : profile_probability probability = profile_probability::even ();
1982 31 : to_latch_e->probability = dup_exit->probability = probability;
1983 :
1984 31 : set_immediate_dominator (CDI_DOMINATORS, dup_exit->src,
1985 : new_exit->src);
1986 31 : new_exit = dup_exit;
1987 31 : *new_e = new_exit;
1988 : }
1989 :
1990 1573 : redirect_edge_and_branch_force (new_exit, preheader);
1991 1573 : flush_pending_stmts (new_exit);
1992 1573 : set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1993 :
1994 : /* And remove the non-necessary forwarder again. Keep the other
1995 : one so we have a proper pre-header for the loop at the exit edge. */
1996 1573 : redirect_edge_pred (single_succ_edge (new_preheader),
1997 : single_pred (new_preheader));
1998 1573 : delete_basic_block (new_preheader);
1999 1573 : set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
2000 1573 : loop_preheader_edge (new_loop)->src);
2001 :
2002 : /* Update dominators for multiple exits. */
2003 1573 : if (multiple_exits_p || create_main_e)
2004 : {
2005 1153 : for (edge alt_e : loop_exits)
2006 : {
2007 457 : if ((alt_e == loop_exit) && !create_main_e)
2008 201 : continue;
2009 256 : basic_block old_dom
2010 256 : = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
2011 256 : if (flow_bb_inside_loop_p (loop, old_dom))
2012 : {
2013 101 : auto_vec<basic_block, 8> queue;
2014 101 : for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
2015 325 : son; son = next_dom_son (CDI_DOMINATORS, son))
2016 224 : if (!flow_bb_inside_loop_p (loop, son))
2017 123 : queue.safe_push (son);
2018 426 : for (auto son : queue)
2019 123 : set_immediate_dominator (CDI_DOMINATORS,
2020 : son, get_bb_copy (old_dom));
2021 101 : }
2022 : }
2023 : }
2024 :
2025 : /* When loop_exit != scalar_exit due to if-conversion loop versioning,
2026 : the `scalar_exit' now has two incoming edges, one from the if-converted
2027 : and one from the peeled prolog loop. It is therefore dominated by a
2028 : common block between these. Update its dominator accordingly. */
2029 232 : if (create_main_e && loop_exit != scalar_exit)
2030 0 : set_immediate_dominator (CDI_DOMINATORS, scalar_exit->dest,
2031 : recompute_dominator (CDI_DOMINATORS,
2032 : scalar_exit->dest));
2033 : }
2034 :
2035 34915 : free (new_bbs);
2036 34915 : free (bbs);
2037 :
2038 34915 : checking_verify_dominators (CDI_DOMINATORS);
2039 :
2040 34915 : return new_loop;
2041 34915 : }
2042 :
2043 :
2044 : /* Given the condition expression COND, put it as the last statement of
2045 : GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
2046 : DOM_BB; return the skip edge. GUARD_TO is the target basic block to
2047 : skip the loop. PROBABILITY is the skip edge's probability. Mark the
2048 : new edge as irreducible if IRREDUCIBLE_P is true. */
2049 :
2050 : static edge
2051 50927 : slpeel_add_loop_guard (basic_block guard_bb, tree cond,
2052 : basic_block guard_to, basic_block dom_bb,
2053 : profile_probability probability, bool irreducible_p)
2054 : {
2055 50927 : gimple_stmt_iterator gsi;
2056 50927 : edge new_e, enter_e;
2057 50927 : gcond *cond_stmt;
2058 50927 : gimple_seq gimplify_stmt_list = NULL;
2059 :
2060 50927 : enter_e = EDGE_SUCC (guard_bb, 0);
2061 50927 : enter_e->flags &= ~EDGE_FALLTHRU;
2062 50927 : enter_e->flags |= EDGE_FALSE_VALUE;
2063 50927 : gsi = gsi_last_bb (guard_bb);
2064 :
2065 50927 : cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
2066 : is_gimple_condexpr_for_cond, NULL_TREE);
2067 50927 : if (gimplify_stmt_list)
2068 22938 : gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
2069 :
2070 50927 : cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
2071 50927 : gsi = gsi_last_bb (guard_bb);
2072 50927 : gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
2073 :
2074 : /* Add new edge to connect guard block to the merge/loop-exit block. */
2075 50927 : new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
2076 :
2077 50927 : new_e->probability = probability;
2078 50927 : if (irreducible_p)
2079 14 : new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
2080 :
2081 50927 : enter_e->probability = probability.invert ();
2082 50927 : set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
2083 :
2084 : /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
2085 50927 : if (enter_e->dest->loop_father->header == enter_e->dest)
2086 441 : split_edge (enter_e);
2087 :
2088 50927 : return new_e;
2089 : }
2090 :
2091 :
2092 : /* This function verifies that the following restrictions apply to LOOP:
2093 : (1) it consists of exactly 2 basic blocks - header, and an empty latch
2094 : for innermost loop and 5 basic blocks for outer-loop.
2095 : (2) it is single entry, single exit
2096 : (3) its exit condition is the last stmt in the header
2097 : (4) E is the entry/exit edge of LOOP.
2098 : */
2099 :
2100 : bool
2101 421902 : slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e,
2102 : const_edge e)
2103 : {
2104 421902 : edge entry_e = loop_preheader_edge (loop);
2105 421902 : gcond *orig_cond = get_loop_exit_condition (exit_e);
2106 421902 : gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
2107 :
2108 : /* All loops have an outer scope; the only case loop->outer is NULL is for
2109 : the function itself. */
2110 421902 : if (!loop_outer (loop)
2111 421902 : || !empty_block_p (loop->latch)
2112 : || !exit_e
2113 : /* Verify that new loop exit condition can be trivially modified. */
2114 421902 : || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
2115 843804 : || (e != exit_e && e != entry_e))
2116 0 : return false;
2117 :
2118 421902 : basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
2119 421902 : get_loop_body_with_size (loop, bbs, loop->num_nodes);
2120 421902 : bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
2121 421902 : free (bbs);
2122 421902 : return ret;
2123 : }
2124 :
2125 : /* Function find_loop_location.
2126 :
2127 : Extract the location of the loop in the source code.
2128 : If the loop is not well formed for vectorization, an estimated
2129 : location is calculated.
2130 : Return the loop location if succeed and NULL if not. */
2131 :
2132 : dump_user_location_t
2133 3569629 : find_loop_location (class loop *loop)
2134 : {
2135 3569629 : gimple *stmt = NULL;
2136 3569629 : basic_block bb;
2137 3569629 : gimple_stmt_iterator si;
2138 :
2139 3569629 : if (!loop)
2140 0 : return dump_user_location_t ();
2141 :
2142 : /* For the root of the loop tree return the function location. */
2143 3569629 : if (!loop_outer (loop))
2144 0 : return dump_user_location_t::from_function_decl (cfun->decl);
2145 :
2146 3569629 : if (loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
2147 : {
2148 : /* We only care about the loop location, so use any exit with location
2149 : information. */
2150 11197484 : for (edge e : get_loop_exit_edges (loop))
2151 : {
2152 3665246 : stmt = get_loop_exit_condition (e);
2153 :
2154 3665246 : if (stmt
2155 3665246 : && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2156 3117485 : return stmt;
2157 3569629 : }
2158 : }
2159 :
2160 : /* If we got here the loop is probably not "well formed",
2161 : try to estimate the loop location */
2162 :
2163 452144 : if (!loop->header)
2164 0 : return dump_user_location_t ();
2165 :
2166 452144 : bb = loop->header;
2167 :
2168 1597532 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2169 : {
2170 1026206 : stmt = gsi_stmt (si);
2171 1026206 : if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2172 332962 : return stmt;
2173 : }
2174 :
2175 119182 : return dump_user_location_t ();
2176 : }
2177 :
2178 : /* Return true if the phi described by STMT_INFO defines an IV of the
2179 : loop to be vectorized. */
2180 :
2181 : static bool
2182 1118594 : iv_phi_p (stmt_vec_info stmt_info)
2183 : {
2184 1118594 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
2185 2237188 : if (virtual_operand_p (PHI_RESULT (phi)))
2186 : return false;
2187 :
2188 891559 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2189 891559 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2190 116309 : return false;
2191 :
2192 : return true;
2193 : }
2194 :
2195 : /* Return true if vectorizer can peel for nonlinear iv. */
2196 : static bool
2197 7936 : vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
2198 : stmt_vec_info stmt_info)
2199 : {
2200 7936 : enum vect_induction_op_type induction_type
2201 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
2202 7936 : tree niters_skip;
2203 : /* Init_expr will be update by vect_update_ivs_after_vectorizer,
2204 : if niters or vf is unkown:
2205 : For shift, when shift mount >= precision, there would be UD.
2206 : For mult, don't known how to generate
2207 : init_expr * pow (step, niters) for variable niters.
2208 : For neg unknown niters are ok, since niters of vectorized main loop
2209 : will always be multiple of 2.
2210 : See also PR113163, PR114196 and PR114485. */
2211 7936 : if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
2212 7936 : || LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2213 7936 : || (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2214 4738 : && induction_type != vect_step_op_neg))
2215 : {
2216 4576 : if (dump_enabled_p ())
2217 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2218 : "Peeling for epilogue is not supported"
2219 : " for this nonlinear induction"
2220 : " when iteration count is unknown or"
2221 : " when using partial vectorization.\n");
2222 4576 : return false;
2223 : }
2224 :
2225 3360 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
2226 266 : && induction_type == vect_step_op_mul)
2227 : {
2228 24 : if (dump_enabled_p ())
2229 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 : "Peeling for is not supported for nonlinear mult"
2231 : " induction using partial vectorization.\n");
2232 24 : return false;
2233 : }
2234 :
2235 : /* Avoid compile time hog on vect_peel_nonlinear_iv_init. */
2236 3094 : if (induction_type == vect_step_op_mul)
2237 : {
2238 401 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
2239 401 : tree type = TREE_TYPE (step_expr);
2240 :
2241 796 : if (wi::exact_log2 (wi::to_wide (step_expr)) == -1
2242 401 : && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type))
2243 : {
2244 6 : if (dump_enabled_p ())
2245 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 : "Avoid compile time hog on"
2247 : " vect_peel_nonlinear_iv_init"
2248 : " for nonlinear induction vec_step_op_mul"
2249 : " when iteration count is too big.\n");
2250 6 : return false;
2251 : }
2252 : }
2253 :
2254 : /* Also doens't support peel for neg when niter is variable.
2255 : ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
2256 3330 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
2257 3330 : if ((niters_skip != NULL_TREE
2258 0 : && (TREE_CODE (niters_skip) != INTEGER_CST
2259 0 : || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0))
2260 3330 : || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
2261 3330 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
2262 : {
2263 4 : if (dump_enabled_p ())
2264 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2265 : "Peeling for alignement is not supported"
2266 : " for nonlinear induction when niters_skip"
2267 : " is not constant.\n");
2268 4 : return false;
2269 : }
2270 :
2271 : return true;
2272 : }
2273 :
2274 : /* Function vect_can_advance_ivs_p
2275 :
2276 : In case the number of iterations that LOOP iterates is unknown at compile
2277 : time, an epilog loop will be generated, and the loop induction variables
2278 : (IVs) will be "advanced" to the value they are supposed to take just before
2279 : the epilog loop. Here we check that the access function of the loop IVs
2280 : and the expression that represents the loop bound are simple enough.
2281 : These restrictions will be relaxed in the future. */
2282 :
2283 : bool
2284 427029 : vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
2285 : {
2286 427029 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2287 427029 : basic_block bb = loop->header;
2288 427029 : gphi_iterator gsi;
2289 :
2290 : /* Analyze phi functions of the loop header. */
2291 :
2292 427029 : if (dump_enabled_p ())
2293 24259 : dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
2294 1443696 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2295 : {
2296 1022275 : tree evolution_part;
2297 1022275 : enum vect_induction_op_type induction_type;
2298 :
2299 1022275 : gphi *phi = gsi.phi ();
2300 1022275 : stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2301 1022275 : if (dump_enabled_p ())
2302 69846 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
2303 : phi_info->stmt);
2304 :
2305 : /* Skip virtual phi's. The data dependences that are associated with
2306 : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
2307 :
2308 : Skip reduction phis. */
2309 1022275 : if (!iv_phi_p (phi_info))
2310 : {
2311 300312 : if (dump_enabled_p ())
2312 24734 : dump_printf_loc (MSG_NOTE, vect_location,
2313 : "reduc or virtual phi. skip.\n");
2314 300312 : continue;
2315 : }
2316 :
2317 721963 : induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2318 721963 : if (induction_type != vect_step_op_add)
2319 : {
2320 7936 : if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info))
2321 : return false;
2322 :
2323 3326 : continue;
2324 : }
2325 :
2326 : /* Analyze the evolution function. */
2327 :
2328 714027 : evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2329 714027 : if (evolution_part == NULL_TREE)
2330 : {
2331 966 : if (dump_enabled_p ())
2332 78 : dump_printf (MSG_MISSED_OPTIMIZATION,
2333 : "No access function or evolution.\n");
2334 966 : return false;
2335 : }
2336 :
2337 : /* FORNOW: We do not transform initial conditions of IVs
2338 : which evolution functions are not invariants in the loop. */
2339 :
2340 713061 : if (!expr_invariant_in_loop_p (loop, evolution_part))
2341 : {
2342 32 : if (dump_enabled_p ())
2343 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2344 : "evolution not invariant in loop.\n");
2345 32 : return false;
2346 : }
2347 :
2348 : /* FORNOW: We do not transform initial conditions of IVs
2349 : which evolution functions are a polynomial of degree >= 2. */
2350 :
2351 1729696 : if (tree_is_chrec (evolution_part))
2352 : {
2353 0 : if (dump_enabled_p ())
2354 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2355 : "evolution is chrec.\n");
2356 0 : return false;
2357 : }
2358 : }
2359 :
2360 : return true;
2361 : }
2362 :
2363 :
2364 : /* Function vect_update_ivs_after_vectorizer.
2365 :
2366 : "Advance" the induction variables of LOOP to the value they should take
2367 : after the execution of LOOP. This is currently necessary because the
2368 : vectorizer does not handle induction variables that are used after the
2369 : loop. Such a situation occurs when the last iterations of LOOP are
2370 : peeled, because:
2371 : 1. We introduced new uses after LOOP for IVs that were not originally used
2372 : after LOOP: the IVs of LOOP are now used by an epilog loop.
2373 : 2. LOOP is going to be vectorized; this means that it will iterate N/VF
2374 : times, whereas the loop IVs should be bumped N times.
2375 :
2376 : Input:
2377 : - LOOP - a loop that is going to be vectorized. The last few iterations
2378 : of LOOP were peeled.
2379 : - NITERS - the number of iterations that LOOP executes (before it is
2380 : vectorized). i.e, the number of times the ivs should be bumped.
2381 : - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
2382 : coming out from LOOP on which there are uses of the LOOP ivs
2383 : (this is the path from LOOP->exit to epilog_loop->preheader).
2384 :
2385 : The new definitions of the ivs are placed in LOOP->exit.
2386 : The phi args associated with the edge UPDATE_E in the bb
2387 : UPDATE_E->dest are updated accordingly.
2388 :
2389 : - EARLY_EXIT_P - Indicates whether the exit is an early exit rather than
2390 : the main latch exit.
2391 :
2392 : Assumption 1: Like the rest of the vectorizer, this function assumes
2393 : a single loop exit that has a single predecessor.
2394 :
2395 : Assumption 2: The phi nodes in the LOOP header and in update_bb are
2396 : organized in the same order.
2397 :
2398 : Assumption 3: The access function of the ivs is simple enough (see
2399 : vect_can_advance_ivs_p). This assumption will be relaxed in the future.
2400 :
2401 : Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
2402 : coming out of LOOP on which the ivs of LOOP are used (this is the path
2403 : that leads to the epilog loop; other paths skip the epilog loop). This
2404 : path starts with the edge UPDATE_E, and its destination (denoted update_bb)
2405 : needs to have its phis updated.
2406 : */
2407 :
2408 : static void
2409 34719 : vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
2410 : tree niters, edge update_e,
2411 : bool early_exit_p)
2412 : {
2413 34719 : gphi_iterator gsi, gsi1;
2414 34719 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2415 34719 : basic_block update_bb = update_e->dest;
2416 34719 : basic_block exit_bb = update_e->src;
2417 : /* Check to see if this is an empty loop pre-header block. If it exists
2418 : we need to use the edge from that block -> loop header for updates but
2419 : must use the original exit_bb to add any new adjustment because there
2420 : can be a skip_epilog edge bypassing the epilog and so the loop pre-header
2421 : too. */
2422 34722 : if (empty_block_p (update_bb) && single_succ_p (update_bb))
2423 : {
2424 3 : update_e = single_succ_edge (update_bb);
2425 3 : update_bb = update_e->dest;
2426 : }
2427 34719 : gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
2428 :
     : /* Walk the PHIs of the loop header and of UPDATE_BB in lockstep; the
     : epilogue was created by loop copying so corresponding PHIs appear in
     : the same order in both blocks. */
2429 34719 : for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
2430 131038 : !gsi_end_p (gsi) && !gsi_end_p (gsi1);
2431 96319 : gsi_next (&gsi), gsi_next (&gsi1))
2432 : {
2433 96319 : tree init_expr;
2434 96319 : tree step_expr, off;
2435 96319 : tree type;
2436 96319 : tree var, ni, ni_name;
2437 :
2438 96319 : gphi *phi = gsi.phi ();
2439 96319 : gphi *phi1 = gsi1.phi ();
2440 96319 : stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2441 96319 : if (dump_enabled_p ())
2442 12291 : dump_printf_loc (MSG_NOTE, vect_location,
2443 : "vect_update_ivs_after_vectorizer: phi: %G",
2444 : (gimple *) phi);
2445 :
2446 : /* Skip reduction and virtual phis. */
2447 96319 : if (!iv_phi_p (phi_info))
2448 : {
2449 43032 : if (dump_enabled_p ())
2450 4744 : dump_printf_loc (MSG_NOTE, vect_location,
2451 : "reduc or virtual phi. skip.\n");
2452 43032 : continue;
2453 : }
2454 :
2455 53287 : type = TREE_TYPE (gimple_phi_result (phi));
2456 53287 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2457 53287 : step_expr = unshare_expr (step_expr);
2458 :
2459 : /* FORNOW: We do not support IVs whose evolution function is a polynomial
2460 : of degree >= 2 or exponential. */
2461 53287 : gcc_assert (!tree_is_chrec (step_expr));
2462 :
2463 53287 : init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2464 53287 : gimple_seq stmts = NULL;
2465 53287 : enum vect_induction_op_type induction_type
2466 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2467 :
     : /* For a simple additive IV the value after NITERS iterations is
     : init + niters * step; use pointer arithmetic for pointer IVs. */
2468 53287 : if (induction_type == vect_step_op_add)
2469 : {
2470 53159 : tree stype = TREE_TYPE (step_expr);
2471 53159 : off = fold_build2 (MULT_EXPR, stype,
2472 : fold_convert (stype, niters), step_expr);
2473 :
2474 53159 : if (POINTER_TYPE_P (type))
2475 3570 : ni = fold_build_pointer_plus (init_expr, off);
2476 : else
2477 49589 : ni = fold_convert (type,
2478 : fold_build2 (PLUS_EXPR, stype,
2479 : fold_convert (stype, init_expr),
2480 : off));
2481 : }
2482 : /* Don't bother calling vect_peel_nonlinear_iv_init. */
2483 128 : else if (induction_type == vect_step_op_neg)
2484 : ni = init_expr;
2485 : else
2486 84 : ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
2487 : niters, step_expr,
2488 : induction_type, early_exit_p);
2489 :
2490 53287 : var = create_tmp_var (type, "tmp");
2491 :
2492 53287 : gimple_seq new_stmts = NULL;
2493 53287 : ni_name = force_gimple_operand (ni, &new_stmts, false, var);
2494 :
2495 : /* Exit_bb shouldn't be empty, but we also can't insert after a control
2496 : statement. */
2497 53287 : if (!gsi_end_p (last_gsi) && !is_ctrl_stmt (gsi_stmt (last_gsi)))
2498 : {
2499 248 : gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
2500 248 : gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
2501 : }
2502 : else
2503 : {
2504 53039 : gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
2505 53039 : gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
2506 : }
2507 :
2508 : /* Update the PHI argument on the requested edge. */
2509 53287 : adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
2510 : }
2511 34719 : }
2512 :
2513 : /* Return a gimple value containing the misalignment (measured in vector
2514 : elements) for the loop described by LOOP_VINFO, i.e. how many elements
2515 : it is away from a perfectly aligned address. Add any new statements
2516 : to SEQ. */
2517 :
2518 : static tree
2519 212 : get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
2520 : {
2521 212 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2522 212 : stmt_vec_info stmt_info = dr_info->stmt;
2523 212 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2524 :
2525 212 : poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2526 212 : unsigned HOST_WIDE_INT target_align_c;
2527 212 : tree target_align_minus_1;
2528 :
     : /* For a negative-step access the vector load starts below the scalar
     : address, so compute the address of the first vector element instead. */
2529 212 : bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2530 212 : size_zero_node) < 0;
2531 212 : tree offset = (negative
2532 212 : ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
2533 : * TREE_INT_CST_LOW
2534 : (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
2535 209 : : size_zero_node);
2536 212 : tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
2537 : stmt_info, seq,
2538 : offset);
2539 212 : tree type = unsigned_type_for (TREE_TYPE (start_addr));
2540 212 : if (target_align.is_constant (&target_align_c))
2541 212 : target_align_minus_1 = build_int_cst (type, target_align_c - 1);
2542 : else
2543 : {
2544 : tree vla = build_int_cst (type, target_align);
2545 : target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla,
2546 : build_int_cst (type, 1));
2547 : }
2548 :
2549 212 : HOST_WIDE_INT elem_size
2550 212 : = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
     : /* Element sizes are powers of two, so the division below can be done
     : as a right shift. */
2551 424 : tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
2552 :
2553 : /* Create: misalign_in_bytes = addr & (target_align - 1). */
2554 212 : tree int_start_addr = fold_convert (type, start_addr);
2555 212 : tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
2556 : target_align_minus_1);
2557 :
2558 : /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
2559 212 : tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
2560 : elem_size_log);
2561 :
2562 212 : return misalign_in_elems;
2563 : }
2564 :
2565 : /* Function vect_gen_prolog_loop_niters
2566 :
2567 : Generate the number of iterations which should be peeled as prolog for the
2568 : loop represented by LOOP_VINFO. It is calculated as the misalignment of
2569 : DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2570 : As a result, after the execution of this loop, the data reference DR will
2571 : refer to an aligned location. The following computation is generated:
2572 :
2573 : If the misalignment of DR is known at compile time:
2574 : addr_mis = int mis = DR_MISALIGNMENT (dr);
2575 : Else, compute address misalignment in bytes:
2576 : addr_mis = addr & (target_align - 1)
2577 :
2578 : prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2579 :
2580 : (elem_size = element type size; an element is the scalar element whose type
2581 : is the inner type of the vectype)
2582 :
2583 : The computations will be emitted at the end of BB. We also compute and
2584 : store upper bound (included) of the result in BOUND.
2585 :
2586 : When the step of the data-ref in the loop is not 1 (as in interleaved data
2587 : and SLP), the number of iterations of the prolog must be divided by the step
2588 : (which is equal to the size of interleaved group).
2589 :
2590 : The above formulas assume that VF == number of elements in the vector. This
2591 : may not hold when there are multiple-types in the loop.
2592 : In this case, for some data-references in the loop the VF does not represent
2593 : the number of elements that fit in the vector. Therefore, instead of VF we
2594 : use TYPE_VECTOR_SUBPARTS. */
2595 :
2596 : static tree
2597 438 : vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
2598 : basic_block bb, poly_int64 *bound)
2599 : {
2600 438 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2601 438 : tree var;
2602 438 : tree niters_type
2603 438 : = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo) ? sizetype
2604 438 : : TREE_TYPE (LOOP_VINFO_NITERS
2605 : (loop_vinfo));
2606 438 : gimple_seq stmts = NULL, new_stmts = NULL;
2607 438 : tree iters, iters_name;
2608 438 : stmt_vec_info stmt_info = dr_info->stmt;
2609 438 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2610 438 : poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2611 :
     : /* Case 1: the peeling amount was decided at compile time. */
2612 438 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2613 : {
2614 226 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2615 :
2616 226 : if (dump_enabled_p ())
2617 217 : dump_printf_loc (MSG_NOTE, vect_location,
2618 : "known peeling = %d.\n", npeel);
2619 :
2620 226 : iters = build_int_cst (niters_type, npeel);
2621 226 : *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2622 : }
     : /* Case 2: compute the peeling amount at runtime from the address. */
2623 : else
2624 : {
2625 212 : tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
2626 212 : tree type = TREE_TYPE (misalign_in_elems);
2627 212 : HOST_WIDE_INT elem_size
2628 212 : = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2629 : /* We only do prolog peeling if the target alignment is known at compile
2630 : time. */
2631 212 : poly_uint64 align_in_elems =
2632 212 : exact_div (target_align, elem_size);
2633 212 : tree align_in_elems_minus_1 =
2634 212 : build_int_cst (type, align_in_elems - 1);
2635 212 : tree align_in_elems_tree = build_int_cst (type, align_in_elems);
2636 :
2637 : /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
2638 : & (align_in_elems - 1)). */
2639 212 : bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2640 212 : size_zero_node) < 0;
2641 212 : if (negative)
2642 3 : iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
2643 : align_in_elems_tree);
2644 : else
2645 209 : iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
2646 : misalign_in_elems);
2647 212 : iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
2648 212 : iters = fold_convert (niters_type, iters);
2649 212 : *bound = align_in_elems;
2650 : }
2651 :
2652 438 : if (dump_enabled_p ())
2653 271 : dump_printf_loc (MSG_NOTE, vect_location,
2654 : "niters for prolog loop: %T\n", iters);
2655 :
2656 438 : var = create_tmp_var (niters_type, "prolog_loop_niters");
2657 438 : iters_name = force_gimple_operand (iters, &new_stmts, false, var);
2658 :
     : /* Emit the computation (if any was generated) at the end of BB, before
     : a trailing control statement if one is present. */
2659 438 : if (new_stmts)
2660 212 : gimple_seq_add_seq (&stmts, new_stmts);
2661 438 : if (stmts)
2662 : {
2663 212 : gcc_assert (single_succ_p (bb));
2664 212 : gimple_stmt_iterator gsi = gsi_last_bb (bb);
2665 212 : if (gsi_end_p (gsi))
2666 42 : gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2667 : else
2668 170 : gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
2669 : }
2670 438 : return iters_name;
2671 : }
2672 :
2673 :
2674 : /* Function vect_update_init_of_dr
2675 :
2676 : If CODE is PLUS, the vector loop starts NITERS iterations after the
2677 : scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2678 : iterations before the scalar one (using masking to skip inactive
2679 : elements). This function updates the information recorded in DR to
2680 : account for the difference. Specifically, it updates the OFFSET
2681 : field of DR_INFO. */
2682 :
2683 : static void
2684 25874 : vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2685 : {
2686 25874 : struct data_reference *dr = dr_info->dr;
2687 25874 : tree offset = dr_info->offset;
2688 25874 : if (!offset)
2689 25874 : offset = build_zero_cst (sizetype);
2690 :
     : /* New offset = old offset +/- niters * step, all in sizetype. */
2691 25874 : niters = fold_build2 (MULT_EXPR, sizetype,
2692 : fold_convert (sizetype, niters),
2693 : fold_convert (sizetype, DR_STEP (dr)));
2694 25874 : offset = fold_build2 (code, sizetype,
2695 : fold_convert (sizetype, offset), niters);
2696 25874 : dr_info->offset = offset;
2697 : }
2698 :
2699 :
2700 : /* Function vect_update_inits_of_drs
2701 :
2702 : Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
2703 : CODE and NITERS are as for vect_update_init_of_dr. */
2704 :
2705 : void
2706 7495 : vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2707 : tree_code code)
2708 : {
2709 7495 : unsigned int i;
2710 7495 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2711 7495 : struct data_reference *dr;
2712 :
2713 7495 : DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2714 :
2715 : /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
2716 : here, but since we might use these niters to update the epilogues niters
2717 : and data references we can't insert them here as this definition might not
2718 : always dominate its uses. */
2719 7495 : if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2720 4586 : niters = fold_convert (sizetype, niters);
2721 :
     : /* Gather/scatter and SIMD-lane accesses compute their addresses
     : differently, so their recorded offsets need no adjustment. */
2722 41172 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2723 : {
2724 26300 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2725 26300 : if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2726 25874 : && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2727 25874 : vect_update_init_of_dr (dr_info, niters, code);
2728 : }
2729 7495 : }
2730 :
2731 : /* For the information recorded in LOOP_VINFO prepare the loop for peeling
2732 : by masking. This involves calculating the number of iterations to
2733 : be peeled and then aligning all memory references appropriately. */
2734 :
2735 : void
2736 1 : vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2737 : {
2738 1 : tree misalign_in_elems;
2739 1 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2740 :
2741 1 : gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2742 :
2743 : /* From the information recorded in LOOP_VINFO get the number of iterations
2744 : that need to be skipped via masking. */
2745 1 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2746 : {
     : /* Known misalignment: skip VF - npeel elements of the first vector
     : iteration. */
2747 1 : poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2748 1 : - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2749 1 : misalign_in_elems = build_int_cst (type, misalign);
2750 : }
2751 : else
2752 : {
     : /* Unknown misalignment: compute it at runtime and emit the
     : computation on the loop preheader edge. */
2753 0 : gimple_seq seq1 = NULL, seq2 = NULL;
2754 0 : misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
2755 0 : misalign_in_elems = fold_convert (type, misalign_in_elems);
2756 0 : misalign_in_elems = force_gimple_operand (misalign_in_elems,
2757 : &seq2, true, NULL_TREE);
2758 0 : gimple_seq_add_seq (&seq1, seq2);
2759 0 : if (seq1)
2760 : {
2761 0 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2762 0 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2763 0 : gcc_assert (!new_bb);
2764 : }
2765 : }
2766 :
2767 1 : if (dump_enabled_p ())
2768 1 : dump_printf_loc (MSG_NOTE, vect_location,
2769 : "misalignment for fully-masked loop: %T\n",
2770 : misalign_in_elems);
2771 :
2772 1 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2773 :
     : /* The masked loop conceptually starts MISALIGN_IN_ELEMS iterations
     : early, so shift all data-reference offsets back by that amount. */
2774 1 : vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
2775 1 : }
2776 :
2777 : /* This function builds ni_name = number of iterations. Statements
2778 : are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
2779 : it to TRUE if new ssa_var is generated. */
2780 :
2781 : tree
2782 61856 : vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2783 : {
     : /* Uncounted loops have no iteration count to build. */
2784 61856 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
2785 : return NULL_TREE;
2786 61782 : tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2787 61782 : if (TREE_CODE (ni) == INTEGER_CST)
2788 : return ni;
2789 : else
2790 : {
2791 25779 : tree ni_name, var;
2792 25779 : gimple_seq stmts = NULL;
2793 25779 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2794 :
2795 25779 : var = create_tmp_var (TREE_TYPE (ni), "niters");
2796 25779 : ni_name = force_gimple_operand (ni, &stmts, false, var);
2797 25779 : if (stmts)
2798 : {
2799 24723 : gsi_insert_seq_on_edge_immediate (pe, stmts);
2800 24723 : if (new_var_p != NULL)
2801 222 : *new_var_p = true;
2802 : }
2803 :
2804 25779 : return ni_name;
2805 : }
2806 : }
2807 :
2808 : /* Calculate the number of iterations above which vectorized loop will be
2809 : preferred over the scalar loop. NITERS_PROLOG is the number of iterations
2810 : of prolog loop. If it's integer const, the integer number is also passed
2811 : in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
2812 : number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
2813 : value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
2814 : threshold below which the scalar (rather than vectorized) loop will be
2815 : executed. This function stores the upper bound (inclusive) of the result
2816 : in BOUND_SCALAR. */
2817 :
2818 : static tree
2819 24799 : vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2820 : poly_int64 bound_prolog, poly_int64 bound_epilog,
2821 : int th, poly_uint64 *bound_scalar,
2822 : bool check_profitability)
2823 : {
2824 24799 : tree type = TREE_TYPE (niters_prolog);
2825 24799 : tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2826 : build_int_cst (type, bound_epilog));
2827 :
2828 24799 : *bound_scalar = bound_prolog + bound_epilog;
2829 24799 : if (check_profitability)
2830 : {
2831 : /* TH indicates the minimum niters of vectorized loop, while we
2832 : compute the maximum niters of scalar loop. */
2833 15877 : th--;
2834 : /* Peeling for constant times. */
2835 15877 : if (int_niters_prolog >= 0)
2836 : {
2837 15839 : *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
2838 15839 : return build_int_cst (type, *bound_scalar);
2839 : }
2840 : /* Peeling an unknown number of times. Note that both BOUND_PROLOG
2841 : and BOUND_EPILOG are inclusive upper bounds. */
2842 38 : if (known_ge (th, bound_prolog + bound_epilog))
2843 : {
2844 0 : *bound_scalar = th;
2845 0 : return build_int_cst (type, th);
2846 : }
2847 : /* Need to do runtime comparison. */
2848 38 : else if (maybe_gt (th, bound_epilog))
2849 : {
2850 38 : *bound_scalar = upper_bound (*bound_scalar, th);
2851 38 : return fold_build2 (MAX_EXPR, type,
2852 : build_int_cst (type, th), niters);
2853 : }
2854 : }
2855 : return niters;
2856 : }
2857 :
2858 : /* NITERS is the number of times that the original scalar loop executes
2859 : after peeling. Work out the maximum number of iterations N that can
2860 : be handled by the vectorized form of the loop and then either:
2861 :
2862 : a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2863 :
2864 : niters_vector = N
2865 :
2866 : b) set *STEP_VECTOR_PTR to one and generate:
2867 :
2868 : niters_vector = N / vf
2869 :
2870 : In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2871 : any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
2872 : is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).
2873 :
2874 : Case (a) is used for LOOP_VINFO_USING_PARTIAL_VECTORS_P or if VF is
2875 : variable. As stated above, NITERS_VECTOR then equals the number
2876 : of scalar iterations and vect_set_loop_condition will handle the
2877 : step. As opposed to (b) we don't know anything about NITER_VECTOR's
2878 : range here.
2879 : */
2880 :
2881 : void
2882 34047 : vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2883 : tree *niters_vector_ptr, tree *step_vector_ptr,
2884 : bool niters_no_overflow)
2885 : {
2886 34047 : tree ni_minus_gap, var;
2887 34047 : tree niters_vector, step_vector;
2888 34047 : tree type = niters ? TREE_TYPE (niters) : sizetype;
2889 34047 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2890 34047 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2891 :
2892 : /* If epilogue loop is required because of data accesses with gaps, we
2893 : subtract one iteration from the total number of iterations here for
2894 : correct calculation of RATIO. */
2895 34047 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2896 : {
2897 372 : ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2898 : build_one_cst (type));
2899 372 : if (!is_gimple_val (ni_minus_gap))
2900 : {
2901 170 : var = create_tmp_var (type, "ni_gap");
2902 170 : gimple *stmts = NULL;
2903 170 : ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2904 : true, var);
2905 170 : gsi_insert_seq_on_edge_immediate (pe, stmts);
2906 : }
2907 : }
2908 : else
2909 : ni_minus_gap = niters;
2910 :
2911 : /* To silence some unexpected warnings, simply initialize to 0. */
2912 34047 : unsigned HOST_WIDE_INT const_vf = 0;
2913 34047 : if (vf.is_constant (&const_vf)
2914 34047 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2915 : {
2916 : /* Create: niters / vf, which is equivalent to niters >> log2(vf) when
2917 : vf is a power of two, and when not we approximate using a
2918 : truncating division. */
2919 : /* If it's known that niters == number of latch executions + 1 doesn't
2920 : overflow, we can generate niters / vf; otherwise we generate
2921 : (niters - vf) / vf + 1 by using the fact that we know ratio
2922 : will be at least one. */
2923 34029 : tree var_vf = build_int_cst (type, const_vf);
2924 34029 : if (niters_no_overflow)
2925 33804 : niters_vector = fold_build2 (TRUNC_DIV_EXPR, type, ni_minus_gap,
2926 : var_vf);
2927 : else
2928 225 : niters_vector
2929 225 : = fold_build2 (PLUS_EXPR, type,
2930 : fold_build2 (TRUNC_DIV_EXPR, type,
2931 : fold_build2 (MINUS_EXPR, type,
2932 : ni_minus_gap,
2933 : var_vf),
2934 : var_vf),
2935 : build_int_cst (type, 1));
2936 34029 : step_vector = build_one_cst (type);
2937 : }
2938 : else
2939 : {
     : /* Case (a): the loop condition code handles the step, so pass the
     : scalar iteration count through unchanged. */
2940 18 : niters_vector = ni_minus_gap;
2941 18 : step_vector = build_int_cst (type, vf);
2942 : }
2943 :
2944 34047 : if (!is_gimple_val (niters_vector))
2945 : {
2946 25549 : var = create_tmp_var (type, "bnd");
2947 25549 : gimple_seq stmts = NULL;
2948 25549 : niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2949 25549 : gsi_insert_seq_on_edge_immediate (pe, stmts);
2950 : /* Peeling algorithm guarantees that vector loop bound is at least ONE,
2951 : we set range information to make niters analyzer's life easier.
2952 : Note the number of latch iteration value can be TYPE_MAX_VALUE so
2953 : we have to represent the vector niter TYPE_MAX_VALUE + 1 / vf. */
2954 25549 : if (stmts != NULL
2955 25549 : && integer_onep (step_vector))
2956 : {
2957 25534 : if (niters_no_overflow)
2958 : {
2959 25327 : int_range<1> vr (type,
2960 50654 : wi::one (TYPE_PRECISION (type)),
2961 50654 : wi::div_trunc (wi::max_value
2962 25327 : (TYPE_PRECISION (type),
2963 25327 : TYPE_SIGN (type)),
2964 : const_vf,
2965 50654 : TYPE_SIGN (type)));
2966 25327 : set_range_info (niters_vector, vr);
2967 25327 : }
2968 : /* For VF == 1 the vector IV might also overflow so we cannot
2969 : assert a minimum value of 1. */
2970 207 : else if (const_vf > 1)
2971 : {
2972 171 : int_range<1> vr (type,
2973 342 : wi::one (TYPE_PRECISION (type)),
2974 513 : wi::rshift (wi::max_value (TYPE_PRECISION (type),
2975 171 : TYPE_SIGN (type))
2976 342 : - (const_vf - 1),
2977 342 : exact_log2 (const_vf), TYPE_SIGN (type))
2978 684 : + 1);
2979 171 : set_range_info (niters_vector, vr);
2980 171 : }
2981 : }
2982 : }
2983 34047 : *niters_vector_ptr = niters_vector;
2984 34047 : *step_vector_ptr = step_vector;
2985 :
2986 34047 : return;
2987 : }
2988 :
2989 : /* Given NITERS_VECTOR which is the number of iterations for vectorized
2990 : loop specified by LOOP_VINFO after vectorization, compute the number
2991 : of iterations before vectorization (niters_vector * vf) and store it
2992 : to NITERS_VECTOR_MULT_VF_PTR. */
2993 :
2994 : static void
2995 33299 : vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2996 : tree niters_vector,
2997 : tree *niters_vector_mult_vf_ptr)
2998 : {
2999 : /* We should be using a step_vector of VF if VF is variable. */
3000 33299 : int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
3001 33299 : tree type = TREE_TYPE (niters_vector);
3002 33299 : tree tree_vf = build_int_cst (type, vf);
3003 33299 : basic_block exit_bb = LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest;
3004 :
3005 33299 : gcc_assert (niters_vector_mult_vf_ptr != NULL);
3006 33299 : tree niters_vector_mult_vf = fold_build2 (MULT_EXPR, type,
3007 : niters_vector, tree_vf);
3008 :
3009 : /* If we've peeled a vector iteration then subtract one full vector
3010 : iteration. */
3011 33299 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3012 19 : niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
3013 : niters_vector_mult_vf, tree_vf);
3014 :
     : /* Emit the computation (if not already a GIMPLE value) at the start
     : of the main exit block, where the result is used. */
3015 33299 : if (!is_gimple_val (niters_vector_mult_vf))
3016 : {
3017 24824 : tree var = create_tmp_var (type, "niters_vector_mult_vf");
3018 24824 : gimple_seq stmts = NULL;
3019 24824 : niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
3020 : &stmts, true, var);
3021 24824 : gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
3022 24824 : gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
3023 : }
3024 33299 : *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
3025 33299 : }
3026 :
3027 : /* Function slpeel_add_loop_guard adds guard skipping from the beginning
3028 : of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
3029 : are two pred edges of the merge point before UPDATE_LOOP. The two loops
3030 : appear like below:
3031 :
3032 : guard_bb:
3033 : if (cond)
3034 : goto merge_bb;
3035 : else
3036 : goto skip_loop;
3037 :
3038 : skip_loop:
3039 : header_a:
3040 : i_1 = PHI<i_0, i_2>;
3041 : ...
3042 : i_2 = i_1 + 1;
3043 : if (cond_a)
3044 : goto latch_a;
3045 : else
3046 : goto exit_a;
3047 : latch_a:
3048 : goto header_a;
3049 :
3050 : exit_a:
3051 : i_5 = PHI<i_2>;
3052 :
3053 : merge_bb:
3054 : ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
3055 :
3056 : update_loop:
3057 : header_b:
3058 : i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
3059 : ...
3060 : i_4 = i_3 + 1;
3061 : if (cond_b)
3062 : goto latch_b;
3063 : else
3064 : goto exit_bb;
3065 : latch_b:
3066 : goto header_b;
3067 :
3068 : exit_bb:
3069 :
3070 : This function creates PHI nodes at merge_bb and replaces the use of i_5
3071 : in the update_loop's PHI node with the result of new PHI result. */
3072 :
3073 : static void
3074 25237 : slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
3075 : class loop *update_loop,
3076 : edge guard_edge, edge merge_edge)
3077 : {
3078 25237 : location_t merge_loc, guard_loc;
3079 25237 : edge orig_e = loop_preheader_edge (skip_loop);
3080 25237 : edge update_e = loop_preheader_edge (update_loop);
3081 25237 : gphi_iterator gsi_orig, gsi_update;
3082 :
     : /* The two loops are copies of each other, so their header PHIs
     : correspond pairwise and can be walked in lockstep. */
3083 25237 : for ((gsi_orig = gsi_start_phis (skip_loop->header),
3084 25237 : gsi_update = gsi_start_phis (update_loop->header));
3085 92455 : !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
3086 67218 : gsi_next (&gsi_orig), gsi_next (&gsi_update))
3087 : {
3088 67218 : gphi *orig_phi = gsi_orig.phi ();
3089 67218 : gphi *update_phi = gsi_update.phi ();
3090 :
3091 : /* Generate new phi node at merge bb of the guard. */
3092 67218 : tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3093 67218 : gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
3094 :
3095 : /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
3096 : args in NEW_PHI for these edges. */
3097 67218 : tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
3098 67218 : tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
3099 67218 : merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
3100 67218 : guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
3101 67218 : add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
3102 67218 : add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
3103 :
3104 : /* Update phi in UPDATE_PHI. */
3105 67218 : adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
3106 : }
3107 25237 : }
3108 :
3109 : /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
3110 : Return a value that equals:
3111 :
3112 : - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
3113 : - SKIP_VALUE when the main loop is skipped. */
3114 :
3115 : tree
3116 3961 : vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
3117 : tree skip_value)
3118 : {
3119 3961 : gcc_assert (loop_vinfo->main_loop_edge);
3120 :
     : /* Merge the two values with a PHI at the block where the main-loop
     : path and the skip-main-loop path join. */
3121 3961 : tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
3122 3961 : basic_block bb = loop_vinfo->main_loop_edge->dest;
3123 3961 : gphi *new_phi = create_phi_node (phi_result, bb);
3124 3961 : add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
3125 : UNKNOWN_LOCATION);
3126 3961 : add_phi_arg (new_phi, skip_value,
3127 : loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
3128 3961 : return phi_result;
3129 : }
3130 :
3131 : /* Function vect_do_peeling.
3132 :
3133 : Input:
3134 : - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
3135 :
3136 : preheader:
3137 : LOOP:
3138 : header_bb:
3139 : loop_body
3140 : if (exit_loop_cond) goto exit_bb
3141 : else goto header_bb
3142 : exit_bb:
3143 :
3144 : - NITERS: The number of iterations of the loop.
3145 : - NITERSM1: The number of iterations of the loop's latch.
3146 : - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
3147 : - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
3148 : CHECK_PROFITABILITY is true.
3149 : Output:
3150 : - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
3151 : iterate after vectorization; see vect_set_loop_condition for details.
3152 : - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
3153 : should be set to the number of scalar iterations handled by the
3154 : vector loop. The SSA name is only used on exit from the loop.
3155 :
3156 : This function peels prolog and epilog from the loop, adds guards skipping
3157 : PROLOG and EPILOG for various conditions. As a result, the changed CFG
3158 : would look like:
3159 :
3160 : guard_bb_1:
3161 : if (prefer_scalar_loop) goto merge_bb_1
3162 : else goto guard_bb_2
3163 :
3164 : guard_bb_2:
3165 : if (skip_prolog) goto merge_bb_2
3166 : else goto prolog_preheader
3167 :
3168 : prolog_preheader:
3169 : PROLOG:
3170 : prolog_header_bb:
3171 : prolog_body
3172 : if (exit_prolog_cond) goto prolog_exit_bb
3173 : else goto prolog_header_bb
3174 : prolog_exit_bb:
3175 :
3176 : merge_bb_2:
3177 :
3178 : vector_preheader:
3179 : VECTOR LOOP:
3180 : vector_header_bb:
3181 : vector_body
3182 : if (exit_vector_cond) goto vector_exit_bb
3183 : else goto vector_header_bb
3184 : vector_exit_bb:
3185 :
3186 : guard_bb_3:
3187 : if (skip_epilog) goto merge_bb_3
3188 : else goto epilog_preheader
3189 :
3190 : merge_bb_1:
3191 :
3192 : epilog_preheader:
3193 : EPILOG:
3194 : epilog_header_bb:
3195 : epilog_body
3196 : if (exit_epilog_cond) goto merge_bb_3
3197 : else goto epilog_header_bb
3198 :
3199 : merge_bb_3:
3200 :
3201 : Note this function peels prolog and epilog only if it's necessary,
3202 : as well as guards.
3203 : This function returns the epilogue loop if a decision was made to vectorize
3204 : it, otherwise NULL.
3205 :
3206 : The analysis resulting in this epilogue loop's loop_vec_info was performed
3207 : in the same vect_analyze_loop call as the main loop's. At that time
3208 : vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
3209 : vectorization factors than the main loop. This list is chained in the
3210 : loop's loop_vec_info in the 'epilogue_vinfo' member. When we decide to
3211 : vectorize the epilogue loop for a lower vectorization factor, the
3212 : loop_vec_info in epilogue_vinfo is updated and linked to the epilogue loop.
3213 : This is later used to vectorize the epilogue.
3214 : The reason the loop_vec_info needs updating is that it was
3215 : constructed based on the original main loop, and the epilogue loop is a
3216 : copy of this loop, so all links pointing to statements in the original loop
3217 : need updating. Furthermore, these loop_vec_infos share the
3218 : data_reference's records, which will also need to be updated.
3219 :
3220 : TODO: Guard for prefer_scalar_loop should be emitted along with
3221 : versioning conditions if loop versioning is needed. */
3222 :
3223 :
3224 : class loop *
3225 61418 : vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
3226 : tree *niters_vector, tree *step_vector,
3227 : tree *niters_vector_mult_vf_var, int th,
3228 : bool check_profitability, bool niters_no_overflow,
3229 : tree *advance)
3230 : {
3231 61418 : edge e, guard_e;
3232 61418 : tree type = niters ? TREE_TYPE (niters) : sizetype;
3233 61418 : tree guard_cond;
3234 61418 : basic_block guard_bb, guard_to;
3235 61418 : profile_probability prob_prolog, prob_vector, prob_epilog;
3236 61418 : int estimated_vf;
3237 61418 : int prolog_peeling = 0;
3238 61418 : bool vect_epilogues = loop_vinfo->epilogue_vinfo != NULL;
3239 61418 : bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);
3240 :
3241 61418 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3242 61417 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3243 :
3244 61418 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3245 61418 : poly_uint64 bound_epilog = 0;
3246 61418 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
3247 61400 : && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3248 32410 : bound_epilog += vf - 1;
3249 61418 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3250 372 : bound_epilog += 1;
3251 :
3252 : /* For early breaks the scalar loop needs to execute at most VF times
3253 : to find the element that caused the break. */
3254 61418 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3255 1406 : bound_epilog = vf;
3256 :
3257 61418 : bool epilog_peeling = maybe_ne (bound_epilog, 0U);
3258 61418 : poly_uint64 bound_scalar = bound_epilog;
3259 :
3260 61418 : if (!prolog_peeling && !epilog_peeling)
3261 : return NULL;
3262 :
3263 : /* Before doing any peeling make sure to reset debug binds outside of
3264 : the loop referring to defs not in LC SSA. */
3265 33373 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3266 101966 : for (unsigned i = 0; i < loop->num_nodes; ++i)
3267 : {
3268 68593 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3269 68593 : imm_use_iterator ui;
3270 68593 : gimple *use_stmt;
3271 162514 : for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3272 93921 : gsi_next (&gsi))
3273 : {
3274 391365 : FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
3275 223656 : if (gimple_debug_bind_p (use_stmt)
3276 20133 : && loop != gimple_bb (use_stmt)->loop_father
3277 20149 : && !flow_loop_nested_p (loop,
3278 16 : gimple_bb (use_stmt)->loop_father))
3279 : {
3280 2 : gimple_debug_bind_reset_value (use_stmt);
3281 2 : update_stmt (use_stmt);
3282 93921 : }
3283 : }
3284 639694 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3285 502508 : gsi_next (&gsi))
3286 : {
3287 502508 : ssa_op_iter op_iter;
3288 502508 : def_operand_p def_p;
3289 815221 : FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
3290 1094295 : FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
3291 505733 : if (gimple_debug_bind_p (use_stmt)
3292 36864 : && loop != gimple_bb (use_stmt)->loop_father
3293 36891 : && !flow_loop_nested_p (loop,
3294 27 : gimple_bb (use_stmt)->loop_father))
3295 : {
3296 0 : gimple_debug_bind_reset_value (use_stmt);
3297 0 : update_stmt (use_stmt);
3298 312713 : }
3299 : }
3300 : }
3301 :
3302 33373 : prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
3303 33373 : estimated_vf = vect_vf_for_cost (loop_vinfo);
3304 33373 : if (estimated_vf == 2)
3305 6610 : estimated_vf = 3;
3306 33373 : prob_prolog = prob_epilog = profile_probability::guessed_always ()
3307 33373 : .apply_scale (estimated_vf - 1, estimated_vf);
3308 :
3309 33373 : class loop *prolog = NULL, *epilog = NULL;
3310 33373 : class loop *first_loop = loop;
3311 33373 : bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
3312 :
3313 : /* SSA form needs to be up-to-date since we are going to manually
3314 : update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
3315 : update SSA state after that, so we have to make sure to not lose any
3316 : pending update needs. */
3317 33373 : gcc_assert (!need_ssa_update_p (cfun));
3318 :
3319 : /* If we're vectorizing an epilogue loop, we have ensured that the
3320 : virtual operand is in SSA form throughout the vectorized main loop.
3321 : Normally it is possible to trace the updated
3322 : vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
3323 : back to scalar-stmt vuses, meaning that the effect of the SSA update
3324 : remains local to the main loop. However, there are rare cases in
3325 : which the vectorized loop should have vdefs even when the original scalar
3326 : loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
3327 : introduces clobbers of the temporary vector array, which in turn
3328 : needs new vdefs. If the scalar loop doesn't write to memory, these
3329 : new vdefs will be the only ones in the vector loop.
3330 : We are currently deferring updating virtual SSA form and creating
3331 : of a virtual PHI for this case so we do not have to make sure the
3332 : newly introduced virtual def is in LCSSA form. */
3333 :
3334 33373 : if (MAY_HAVE_DEBUG_BIND_STMTS)
3335 : {
3336 12395 : gcc_assert (!adjust_vec.exists ());
3337 12395 : adjust_vec.create (32);
3338 : }
3339 33373 : initialize_original_copy_tables ();
3340 :
3341 : /* Record the anchor bb at which the guard should be placed if the scalar
3342 : loop might be preferred. */
3343 33373 : basic_block anchor = loop_preheader_edge (loop)->src;
3344 :
3345 : /* Generate the number of iterations for the prolog loop. We do this here
3346 : so that we can also get the upper bound on the number of iterations. */
3347 33373 : tree niters_prolog;
3348 33373 : poly_int64 bound_prolog = 0;
3349 33373 : if (prolog_peeling)
3350 : {
3351 438 : niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
3352 : &bound_prolog);
3353 : /* If alignment peeling is known, we will always execute prolog. */
3354 438 : if (TREE_CODE (niters_prolog) == INTEGER_CST)
3355 226 : prob_prolog = profile_probability::always ();
3356 : }
3357 : else
3358 32935 : niters_prolog = build_int_cst (type, 0);
3359 :
3360 33373 : loop_vec_info epilogue_vinfo = loop_vinfo->epilogue_vinfo;
3361 33373 : tree niters_vector_mult_vf = NULL_TREE;
3362 : /* Saving NITERs before the loop, as this may be changed by prologue. */
3363 33373 : tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
3364 33373 : edge update_e = NULL, skip_e = NULL;
3365 33373 : unsigned int lowest_vf = constant_lower_bound (vf);
3366 : /* Prolog loop may be skipped. */
3367 33373 : bool skip_prolog = (prolog_peeling != 0);
3368 : /* Skip this loop to epilog when there are not enough iterations to enter this
3369 : vectorized loop. If true we should perform runtime checks on the NITERS
3370 : to check whether we should skip the current vectorized loop. If we know
3371 : the number of scalar iterations we may choose to add a runtime check if
3372 : this number may be smaller than the number of iterations required
3373 : when we know the number of scalar iterations may potentially
3374 : be smaller than the number of iterations required to enter this loop, for
3375 : this we use the upper bounds on the prolog and epilog peeling. When we
3376 : don't know the number of iterations and don't require versioning it is
3377 : because we have asserted that there are enough scalar iterations to enter
3378 : the main loop, so this skip is not necessary. When we are versioning then
3379 : we only add such a skip if we have chosen to vectorize the epilogue. */
3380 33373 : bool skip_vector = false;
3381 33373 : if (!uncounted_p)
3382 8516 : skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3383 41846 : ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
3384 8516 : bound_prolog + bound_epilog)
3385 24837 : : (!LOOP_VINFO_USE_VERSIONING_WITHOUT_PEELING (loop_vinfo)
3386 38 : || vect_epilogues));
3387 :
3388 : /* Epilog loop must be executed if the number of iterations for epilog
3389 : loop is known at compile time, otherwise we need to add a check at
3390 : the end of vector loop and skip to the end of epilog loop. */
3391 33373 : bool skip_epilog = (prolog_peeling < 0
3392 33161 : || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3393 33373 : || !vf.is_constant ());
3394 : /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
3395 : loop must be executed. */
3396 33373 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3397 33001 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3398 1778 : skip_epilog = false;
3399 :
3400 33373 : class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3401 33373 : auto_vec<profile_count> original_counts;
3402 33373 : basic_block *original_bbs = NULL;
3403 :
3404 33373 : if (skip_vector)
3405 : {
3406 24799 : split_edge (loop_preheader_edge (loop));
3407 :
3408 : /* Save the current loop body counts so they can be restored on the
3409 : epilogue copy after the whole-loop probability scaling below. */
3409 24799 : if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3410 : {
3411 22585 : original_bbs = get_loop_body (loop);
3412 68178 : for (unsigned int i = 0; i < loop->num_nodes; i++)
3413 45593 : original_counts.safe_push(original_bbs[i]->count);
3414 : }
3415 :
3416 : /* Due to the order in which we peel prolog and epilog, we first
3417 : propagate probability to the whole loop. The purpose is to
3418 : avoid adjusting probabilities of both prolog and vector loops
3419 : separately. Note in this case, the probability of epilog loop
3420 : needs to be scaled back later. */
3421 24799 : basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3422 24799 : if (prob_vector.initialized_p ())
3423 : {
3424 24799 : scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3425 24799 : scale_loop_profile (loop, prob_vector, -1);
3426 : }
3427 : }
3428 :
3429 33373 : if (vect_epilogues)
3430 : {
3431 : /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3432 : use the original scalar loop as remaining epilogue if necessary. */
3433 7056 : LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3434 7056 : = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3435 7056 : LOOP_VINFO_SCALAR_MAIN_EXIT (epilogue_vinfo)
3436 7056 : = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
3437 : }
3438 :
3439 33373 : if (prolog_peeling)
3440 : {
3441 438 : e = loop_preheader_edge (loop);
3442 438 : edge exit_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
3443 438 : gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e)
3444 : && (!LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3445 : || uncounted_p));
3446 :
3447 : /* Peel prolog and put it on preheader edge of loop. */
3448 438 : edge scalar_e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
3449 438 : edge prolog_e = NULL;
3450 438 : prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, exit_e,
3451 : scalar_loop, scalar_e,
3452 : e, &prolog_e, true, NULL,
3453 : false, uncounted_p);
3454 :
3455 438 : gcc_assert (prolog);
3456 438 : prolog->force_vectorize = false;
3457 :
3458 : /* Assign hierarchical discriminators to distinguish prolog loop. */
3459 438 : gimple *prolog_last = last_nondebug_stmt (prolog->header);
3460 438 : location_t prolog_loc
3461 438 : = prolog_last ? gimple_location (prolog_last) : UNKNOWN_LOCATION;
3462 438 : if (prolog_loc != UNKNOWN_LOCATION)
3463 : {
3464 436 : unsigned int prolog_copyid = allocate_copyid_base (prolog_loc, 1);
3465 436 : assign_discriminators_to_loop (prolog, 0, prolog_copyid);
3466 : }
3467 438 : first_loop = prolog;
3468 438 : reset_original_copy_tables ();
3469 :
3470 : /* Update the number of iterations for prolog loop. */
3471 438 : tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3472 438 : vect_set_loop_condition (prolog, prolog_e, NULL, niters_prolog,
3473 : step_prolog, NULL_TREE, false);
3474 :
3475 : /* Skip the prolog loop. */
3476 438 : if (skip_prolog)
3477 : {
3478 438 : guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3479 : niters_prolog, build_int_cst (type, 0));
3480 438 : guard_bb = loop_preheader_edge (prolog)->src;
3481 438 : basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3482 438 : guard_to = split_edge (loop_preheader_edge (loop));
3483 438 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3484 : guard_to, guard_bb,
3485 : prob_prolog.invert (),
3486 : irred_flag);
3487 : /* For alternative (early-break) exits of the prolog, blocks that
3488 : were dominated from inside the prolog must now be dominated by
3489 : the new guard block. */
3487 2008 : for (edge alt_e : get_loop_exit_edges (prolog))
3488 : {
3489 694 : if (alt_e == prolog_e)
3490 438 : continue;
3491 256 : basic_block old_dom
3492 256 : = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
3493 256 : if (flow_bb_inside_loop_p (prolog, old_dom))
3494 : {
3495 101 : auto_vec<basic_block, 8> queue;
3496 101 : for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
3497 325 : son; son = next_dom_son (CDI_DOMINATORS, son))
3498 224 : if (!flow_bb_inside_loop_p (prolog, son))
3499 123 : queue.safe_push (son);
3500 426 : for (auto son : queue)
3501 123 : set_immediate_dominator (CDI_DOMINATORS, son, guard_bb);
3502 101 : }
3503 438 : }
3504 :
3505 438 : e = EDGE_PRED (guard_to, 0);
3506 438 : e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3507 438 : slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
3508 :
3509 438 : scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3510 438 : scale_loop_profile (prolog, prob_prolog,
3511 438 : estimated_poly_value (bound_prolog) - 1);
3512 : }
3513 :
3514 : /* Update init address of DRs. */
3515 438 : vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
3516 438 : if (!uncounted_p)
3517 : {
3518 : /* Update niters for vector loop. */
3519 407 : LOOP_VINFO_NITERS (loop_vinfo)
3520 407 : = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3521 407 : LOOP_VINFO_NITERSM1 (loop_vinfo)
3522 407 : = fold_build2 (MINUS_EXPR, type,
3523 : LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3524 : }
3525 438 : bool new_var_p = false;
3526 438 : niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
3527 : /* It's guaranteed that vector loop bound before vectorization is at
3528 : least VF, so set range information for newly generated var. */
3529 438 : if (new_var_p)
3530 : {
3531 222 : int_range<1> vr (type,
3532 444 : wi::to_wide (build_int_cst (type, lowest_vf)),
3533 444 : wi::to_wide (TYPE_MAX_VALUE (type)));
3534 222 : set_range_info (niters, vr);
3535 222 : }
3536 :
3537 : /* Prolog iterates at most bound_prolog times, latch iterates at
3538 : most bound_prolog - 1 times. */
3539 438 : if (bound_prolog.is_constant ())
3540 438 : record_niter_bound (prolog, bound_prolog.to_constant () - 1, false,
3541 : true);
3542 438 : delete_update_ssa ();
3543 438 : adjust_vec_debug_stmts ();
3544 438 : scev_reset ();
3545 : }
3546 33373 : basic_block bb_before_epilog = NULL;
3547 :
3548 33373 : if (epilog_peeling)
3549 : {
3550 33342 : e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
3551 33342 : gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e));
3552 :
3553 : /* Peel epilog and put it on exit edge of loop. If we are vectorizing
3554 : said epilog then we should use a copy of the main loop as a starting
3555 : point. This loop may have already had some preliminary transformations
3556 : to allow for more optimal vectorization, for example if-conversion.
3557 : If we are not vectorizing the epilog then we should use the scalar loop
3558 : as the transformations mentioned above make less or no sense when not
3559 : vectorizing. */
3560 33342 : edge scalar_e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
3561 33342 : epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3562 7056 : edge epilog_e = vect_epilogues ? e : scalar_e;
3563 33342 : edge new_epilog_e = NULL;
3564 33342 : auto_vec<basic_block> doms;
3565 33342 : epilog
3566 33342 : = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e, epilog, epilog_e, e,
3567 : &new_epilog_e, true, &doms,
3568 : uncounted_p);
3569 :
3570 33342 : LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo) = new_epilog_e;
3571 33342 : gcc_assert (epilog);
3572 33342 : gcc_assert (new_epilog_e);
3573 33342 : epilog->force_vectorize = false;
3574 33342 : bb_before_epilog = loop_preheader_edge (epilog)->src;
3575 :
3576 : /* Assign hierarchical discriminators to distinguish epilog loop.
3577 : Only assign if it's a scalar epilog. If it will be vectorized
3578 : (vect_epilogues), discriminators will be assigned.
3579 : Use dynamic copy_id allocation instead of hardcoded constants. */
3580 33342 : if (!vect_epilogues)
3581 : {
3582 26286 : gimple *epilog_last = last_nondebug_stmt (epilog->header);
3583 26286 : location_t epilog_loc
3584 26286 : = epilog_last ? gimple_location (epilog_last) : UNKNOWN_LOCATION;
3585 26257 : if (epilog_loc != UNKNOWN_LOCATION)
3586 : {
3587 21424 : unsigned int epilog_copyid = allocate_copyid_base (epilog_loc, 1);
3588 21424 : assign_discriminators_to_loop (epilog, 0, epilog_copyid);
3589 : }
3590 : }
3591 :
3592 : /* Scalar version loop may be preferred. In this case, add guard
3593 : and skip to epilog. Note this only happens when the number of
3594 : iterations of loop is unknown at compile time, otherwise this
3595 : won't be vectorized. */
3596 33342 : if (skip_vector)
3597 : {
3598 : /* Additional epilogue iteration is peeled if gap exists. */
3599 24799 : tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3600 24799 : bound_prolog, bound_epilog,
3601 : th, &bound_scalar,
3602 : check_profitability);
3603 : /* Build guard against NITERSM1 since NITERS may overflow. */
3604 24799 : guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3605 24799 : guard_bb = anchor;
3606 24799 : guard_to = split_edge (loop_preheader_edge (epilog));
3607 24799 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3608 : guard_to, guard_bb,
3609 : prob_vector.invert (),
3610 : irred_flag);
3611 24799 : skip_e = guard_e;
3612 24799 : e = EDGE_PRED (guard_to, 0);
3613 24799 : e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3614 :
3615 : /* Handle any remaining dominator updates needed after
3616 : inserting the loop skip edge above. */
3617 24799 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
3618 286 : && prolog_peeling)
3619 : {
3620 : /* Adding a skip edge to skip a loop with multiple exits
3621 : means the dominator of the join blocks for all exits shifts
3622 : from the prolog skip guard to the loop skip guard. */
3623 163 : auto prolog_skip_bb
3624 163 : = single_pred (loop_preheader_edge (prolog)->src);
3625 163 : auto needs_update
3626 163 : = get_dominated_by (CDI_DOMINATORS, prolog_skip_bb);
3627 :
3628 : /* Update everything except for the immediate children of
3629 : the prolog skip block (the prolog and vector preheaders).
3630 : Those should remain dominated by the prolog skip block itself,
3631 : since the loop guard edge goes to the epilogue. */
3632 866 : for (auto bb : needs_update)
3633 377 : if (bb != EDGE_SUCC (prolog_skip_bb, 0)->dest
3634 377 : && bb != EDGE_SUCC (prolog_skip_bb, 1)->dest)
3635 51 : set_immediate_dominator (CDI_DOMINATORS, bb, guard_bb);
3636 163 : }
3637 :
3638 24799 : slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3639 :
3640 : /* Simply propagate profile info from guard_bb to guard_to which is
3641 : a merge point of control flow. */
3642 24799 : profile_count old_count = guard_to->count;
3643 24799 : guard_to->count = guard_bb->count;
3644 :
3645 : /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3646 24799 : if (vect_epilogues || scalar_loop == NULL)
3647 : {
3648 22585 : gcc_assert(epilog->num_nodes == loop->num_nodes);
3649 22585 : basic_block *bbs = get_loop_body (epilog);
3650 68178 : for (unsigned int i = 0; i < epilog->num_nodes; i++)
3651 : {
3652 45593 : gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
3653 45593 : bbs[i]->count = original_counts[i];
3654 : }
3655 22585 : free (bbs);
3656 22585 : free (original_bbs);
3657 : }
3658 2214 : else if (old_count.nonzero_p ())
3659 2214 : scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
3660 :
3661 : /* Only need to handle basic block before epilog loop if it's not
3662 : the guard_bb, which is the case when skip_vector is true. */
3663 24799 : if (guard_bb != bb_before_epilog && single_pred_p (bb_before_epilog))
3664 24513 : bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
3665 24799 : bb_before_epilog = loop_preheader_edge (epilog)->src;
3666 : }
3667 :
3668 33342 : if (!uncounted_p)
3669 : {
3670 : /* If loop is peeled for non-zero constant times, now niters refers to
3671 : orig_niters - prolog_peeling, it won't overflow even the
3672 : orig_niters overflows. */
3673 33299 : niters_no_overflow |= (prolog_peeling > 0);
3674 33299 : vect_gen_vector_loop_niters (loop_vinfo, niters,
3675 : niters_vector, step_vector,
3676 : niters_no_overflow);
3677 33299 : if (!integer_onep (*step_vector))
3678 : {
3679 : /* On exit from the loop we will have an easy way of calculating
3680 : NITERS_VECTOR / STEP * STEP. Install a dummy definition
3681 : until then. */
3682 0 : niters_vector_mult_vf
3683 0 : = make_ssa_name (TREE_TYPE (*niters_vector));
3684 0 : SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3685 0 : *niters_vector_mult_vf_var = niters_vector_mult_vf;
3686 : }
3687 : else
3688 33299 : vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3689 : &niters_vector_mult_vf);
3690 : /* Update IVs of original loop as if they were advanced by
3691 : niters_vector_mult_vf steps. */
3692 33299 : gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3693 33299 : update_e = skip_vector ? e : loop_preheader_edge (epilog);
3694 : }
3695 33342 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3696 1406 : update_e = single_succ_edge (LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest);
3697 :
3698 : /* If we have a peeled vector iteration we will never skip the epilog loop
3699 : and we can simplify the cfg a lot by not doing the edge split. */
3700 33342 : if (skip_epilog
3701 33342 : || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
3702 1406 : && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)))
3703 : {
3704 25690 : guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3705 : niters, niters_vector_mult_vf);
3706 :
3707 25690 : guard_bb = LOOP_VINFO_MAIN_EXIT (loop_vinfo)->dest;
3708 25690 : edge epilog_e = LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo);
3709 25690 : guard_to = epilog_e->dest;
3710 26761 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3711 : skip_vector ? anchor : guard_bb,
3712 : prob_epilog.invert (),
3713 : irred_flag);
3714 :
3715 25690 : doms.safe_push (guard_to);
3716 25690 : if (vect_epilogues)
3717 5655 : epilogue_vinfo->skip_this_loop_edge = guard_e;
3718 25690 : edge main_iv = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
3719 25690 : gphi_iterator gsi2 = gsi_start_phis (main_iv->dest);
3720 25690 : for (gphi_iterator gsi = gsi_start_phis (guard_to);
3721 62253 : !gsi_end_p (gsi); gsi_next (&gsi))
3722 : {
3723 : /* We are expecting all of the PHIs we have on epilog_e
3724 : to be also on the main loop exit. But sometimes
3725 : a stray virtual definition can appear at epilog_e
3726 : which we can then take as the same on all exits,
3727 : we've removed the LC SSA PHI on the main exit before
3728 : so we wouldn't need to create a loop PHI for it. */
3729 36563 : if (virtual_operand_p (gimple_phi_result (*gsi))
3730 36563 : && (gsi_end_p (gsi2)
3731 36356 : || !virtual_operand_p (gimple_phi_result (*gsi2))))
3732 180 : add_phi_arg (*gsi,
3733 90 : gimple_phi_arg_def_from_edge (*gsi, epilog_e),
3734 : guard_e, UNKNOWN_LOCATION);
3735 : else
3736 : {
3737 36473 : add_phi_arg (*gsi, gimple_phi_result (*gsi2), guard_e,
3738 : UNKNOWN_LOCATION);
3739 36473 : gsi_next (&gsi2);
3740 : }
3741 : }
3742 :
3743 : /* Only need to handle basic block before epilog loop if it's not
3744 : the guard_bb, which is the case when skip_vector is true. */
3745 25690 : if (guard_bb != bb_before_epilog)
3746 : {
3747 25687 : prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3748 :
3749 25687 : scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3750 : }
3751 25690 : scale_loop_profile (epilog, prob_epilog, -1);
3752 : }
3753 :
3754 : /* If we have a peeled vector iteration, all exits are the same, leave it
3755 : and so the main exit needs to be treated the same as the alternative
3756 : exits in that we leave their updates to vectorizable_live_operations.
3757 : */
3758 33342 : tree vector_iters_vf = niters_vector_mult_vf;
3759 33342 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3760 : {
3761 1406 : tree tmp_niters_vf
3762 1406 : = make_ssa_name (LOOP_VINFO_EARLY_BRK_IV_TYPE (loop_vinfo));
3763 :
3764 1449 : if (!(LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)
3765 1463 : && get_loop_exit_edges (loop).length () == 1))
3766 : {
3767 1377 : basic_block exit_bb = NULL;
3768 1377 : edge update_e = NULL;
3769 :
3770 : /* Identify the early exit merge block. I wish we had stored
3771 : this. */
3772 4131 : for (auto e : get_loop_exit_edges (loop))
3773 1377 : if (e != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
3774 : {
3775 1377 : exit_bb = e->dest;
3776 1377 : update_e = single_succ_edge (exit_bb);
3777 1377 : break;
3778 1377 : }
3779 1377 : vect_update_ivs_after_vectorizer (loop_vinfo, tmp_niters_vf,
3780 : update_e, true);
3781 : }
3782 1406 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3783 : vector_iters_vf = tmp_niters_vf;
3784 :
3785 1406 : LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo) = tmp_niters_vf;
3786 : }
3787 :
3788 33342 : bool recalculate_peel_niters_init
3789 33342 : = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
3790 33342 : vect_update_ivs_after_vectorizer (loop_vinfo, vector_iters_vf,
3791 : update_e,
3792 : recalculate_peel_niters_init);
3793 :
3794 : /* Recalculate the dominators after adding the guard edge. */
3795 33342 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3796 1406 : iterate_fix_dominators (CDI_DOMINATORS, doms, false);
3797 :
3798 : /* When we do not have a loop-around edge to the epilog we know
3799 : the vector loop covered at least VF scalar iterations unless
3800 : we have early breaks.
3801 : Update any known upper bound with this knowledge. */
3802 33342 : if (! skip_vector
3803 8543 : && ! LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3804 : {
3805 7423 : if (epilog->any_upper_bound)
3806 7423 : epilog->nb_iterations_upper_bound -= lowest_vf;
3807 7423 : if (epilog->any_likely_upper_bound)
3808 7423 : epilog->nb_iterations_likely_upper_bound -= lowest_vf;
3809 7423 : if (epilog->any_estimate)
3810 7421 : epilog->nb_iterations_estimate -= lowest_vf;
3811 : }
3812 :
3813 33342 : unsigned HOST_WIDE_INT bound;
3814 33342 : if (bound_scalar.is_constant (&bound))
3815 : {
3816 33342 : gcc_assert (bound != 0);
3817 : /* Adjust the upper bound by the extra peeled vector iteration if we
3818 : are an epilogue of a peeled vect loop and not VLA. For VLA the
3819 : loop bounds are unknown. */
3820 66665 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3821 33342 : && vf.is_constant ())
3822 62 : bound += vf.to_constant ();
3823 : /* -1 to convert loop iterations to latch iterations. */
3824 33342 : record_niter_bound (epilog, bound - 1, false, true);
3825 33342 : scale_loop_profile (epilog, profile_probability::always (),
3826 : bound - 1);
3827 : }
3828 :
3829 33342 : delete_update_ssa ();
3830 33342 : adjust_vec_debug_stmts ();
3831 33342 : scev_reset ();
3832 33342 : }
3833 :
3834 33373 : if (vect_epilogues)
3835 : {
3836 7056 : epilog->aux = epilogue_vinfo;
3837 7056 : LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3838 7056 : LOOP_VINFO_MAIN_EXIT (epilogue_vinfo)
3839 7056 : = LOOP_VINFO_EPILOGUE_MAIN_EXIT (loop_vinfo);
3840 :
3841 7056 : loop_constraint_clear (epilog, LOOP_C_INFINITE);
3842 :
3843 : /* We now must calculate the number of NITERS performed by the previous
3844 : loop and EPILOGUE_NITERS to be performed by the epilogue. */
3845 7056 : tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3846 : niters_prolog, niters_vector_mult_vf);
3847 :
3848 : /* If skip_vector we may skip the previous loop, we insert a phi-node to
3849 : determine whether we are coming from the previous vectorized loop
3850 : using the update_e edge or the skip_vector basic block using the
3851 : skip_e edge. */
3852 7056 : if (skip_vector)
3853 : {
3854 5725 : gcc_assert (update_e != NULL && skip_e != NULL);
3855 5725 : gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3856 : update_e->dest);
3857 5725 : tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3858 5725 : gimple *stmt = gimple_build_assign (new_ssa, niters);
3859 5725 : gimple_stmt_iterator gsi;
3860 5725 : if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3861 5725 : && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3862 : {
3863 5725 : gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3864 5725 : gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3865 : }
3866 : else
3867 : {
3868 0 : gsi = gsi_last_bb (update_e->src);
3869 0 : gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3870 : }
3871 :
3872 5725 : niters = new_ssa;
3873 5725 : add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3874 5725 : add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3875 : UNKNOWN_LOCATION);
3876 5725 : niters = PHI_RESULT (new_phi);
3877 5725 : epilogue_vinfo->main_loop_edge = update_e;
3878 5725 : epilogue_vinfo->skip_main_loop_edge = skip_e;
3879 : }
3880 :
3881 : /* Set ADVANCE to the number of iterations performed by the previous
3882 : loop and its prologue. */
3883 7056 : *advance = niters;
3884 :
3885 : /* Subtract the number of iterations performed by the vectorized loop
3886 : from the number of total iterations. */
3887 7056 : tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3888 : before_loop_niters,
3889 : niters);
3890 :
3891 7056 : LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3892 7056 : LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3893 7056 : = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3894 : epilogue_niters,
3895 : build_one_cst (TREE_TYPE (epilogue_niters)));
3896 : }
3897 :
3898 33373 : adjust_vec.release ();
3899 33373 : free_original_copy_tables ();
3900 :
3901 : /* Return the epilogue loop only when it is itself going to be
3902 : vectorized; otherwise callers get NULL. */
3901 33373 : return vect_epilogues ? epilog : NULL;
3902 33373 : }
3902 :
3903 : /* Function vect_create_cond_for_niters_checks.
3904 :
3905 : Create a conditional expression that represents the run-time checks for
3906 : loop's niter. The loop is guaranteed to terminate if the run-time
3907 : checks hold.
3908 :
3909 : Input:
3910 : COND_EXPR - input conditional expression. New conditions will be chained
3911 : with logical AND operation. If it is NULL, the niter
3912 : assumptions become the condition on their own.
3913 : LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
3914 : under which the loop's niter analysis is valid.
3915 :
3916 : Output:
3917 : COND_EXPR - conditional expression.
3918 :
3919 : The returned COND_EXPR is the conditional expression to be used in the
3920 : if statement that controls which version of the loop gets executed at
3921 : runtime. */
3922 :
3923 : static void
3924 388 : vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3925 : {
3926 388 : tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3927 :
3928 388 : if (*cond_expr)
3929 388 : *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3930 : *cond_expr, part_cond_expr);
3931 : else
3932 0 : *cond_expr = part_cond_expr;
3933 388 : }
3934 :
3935 : /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3936 : and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3937 :
3938 : static void
3939 349 : chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3940 : {
3941 349 : if (*cond_expr)
3942 345 : *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3943 : *cond_expr, part_cond_expr);
3944 : else
3945 4 : *cond_expr = part_cond_expr;
3946 349 : }
3947 :
3948 : /* Function vect_create_cond_for_align_checks.
3949 :
3950 : Create a conditional expression that represents the alignment checks for
3951 : all of data references (array element references) whose alignment must be
3952 : checked at runtime.
3953 :
3954 : Input:
3955 : COND_EXPR - input conditional expression. New conditions will be chained
3956 : with logical AND operation.
3957 : LOOP_VINFO - three fields of the loop information are used.
3958 : LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3959 : LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3960 : LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT indicates which check applies.
3961 :
3962 : Output:
3963 : COND_EXPR_STMT_LIST - statements needed to construct the conditional
3964 : expression.
3965 : The returned value is the conditional expression to be used in the if
3966 : statement that controls which version of the loop gets executed at runtime.
3967 :
3968 : Based on the boolean value of LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT, we decide
3969 : which type of check should be applied and create two different expressions
3970 : accordingly.
3971 : 1) When LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is false, we see if all data refs
3972 : to be checked are already aligned to an alignment boundary. We create
3973 : an expression of "(a_1 | a_2 | a_3 | ... | a_n) & mask", where "a_i" is
3974 : the address of i'th data reference.
3975 : 2) When LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is true, we see if all data refs
3976 : can be aligned to a boundary after a certain amount of peeling, in other
3977 : words, their addresses have the same bottom bits according to the mask.
3978 : We create "((a_1 ^ a_2) | (a_2 ^ a_3) | ... | (a_n-1 ^ a_n)) & mask",
3979 : where "a_i" is the address of i'th data reference.
3980 :
3981 : Both algorithms make two assumptions:
3982 : 1) The number of bytes "n" in a vector is a power of 2.
3983 : 2) An address "a" is aligned if a%n is zero and that this
3984 : test can be done as a&(n-1) == 0. For example, for 16
3985 : byte vectors the test is a&0xf == 0. */
3986 :
3987 : static void
3988 43 : vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3989 : tree *cond_expr,
3990 : gimple_seq *cond_expr_stmt_list)
3991 : {
3992 43 : const vec<stmt_vec_info> &may_misalign_stmts
3993 : = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3994 43 : stmt_vec_info stmt_info;
3995 43 : poly_uint64 mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3996 43 : tree mask_cst;
3997 43 : unsigned int i;
3998 43 : tree int_ptrsize_type;
3999 43 : char tmp_name[30];
4000 43 : tree or_tmp_name = NULL_TREE;
4001 43 : tree prev_addr_tmp_name = NULL_TREE;
4002 43 : tree and_tmp_name;
4003 43 : gimple *and_stmt;
4004 43 : tree ptrsize_zero;
4005 43 : tree part_cond_expr;
4006 :
4007 43 : gcc_assert (known_ne (mask, 0U));
4008 :
4009 43 : int_ptrsize_type = signed_type_for (ptr_type_node);
4010 :
4011 : /* If LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is true, we should have at least two
4012 : datarefs to check the mutual alignment. */
4013 43 : gcc_assert (may_misalign_stmts.length () > 1
4014 : || !LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo));
4015 :
4016 103 : FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
4017 : {
4018 60 : gimple_seq new_stmt_list = NULL;
4019 60 : tree addr_base;
4020 60 : tree addr_tmp_name;
4021 60 : tree xor_tmp_name;
4022 60 : tree new_or_tmp_name;
4023 60 : gimple *addr_stmt, *or_stmt, *xor_stmt;
4024 60 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4025 60 : bool negative = tree_int_cst_compare
4026 60 : (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
4027 60 : tree offset = negative
4028 60 : ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
4029 : * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
4030 60 : : size_zero_node;
4031 :
4032 : /* create: addr_tmp = (int)(address_of_first_vector) */
4033 60 : addr_base =
4034 60 : vect_create_addr_base_for_vector_ref (loop_vinfo,
4035 : stmt_info, &new_stmt_list,
4036 : offset);
4037 60 : if (new_stmt_list != NULL)
4038 14 : gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
4039 :
4040 60 : sprintf (tmp_name, "addr2int%d", i);
4041 60 : addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
4042 60 : addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
4043 60 : gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
4044 :
4045 60 : if (LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo))
4046 : {
4047 : /* Create "((a_1 ^ a_2) | (a_2 ^ a_3) | ... | (a_n-1 ^ a_n)) & mask"
4048 : to check mutual alignment. */
4049 32 : if (prev_addr_tmp_name != NULL_TREE)
4050 : {
4051 16 : sprintf (tmp_name, "xorptrs%d_%d", i - 1, i);
4052 16 : xor_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
4053 : tmp_name);
4054 16 : xor_stmt = gimple_build_assign (xor_tmp_name, BIT_XOR_EXPR,
4055 : prev_addr_tmp_name,
4056 : addr_tmp_name);
4057 16 : gimple_seq_add_stmt (cond_expr_stmt_list, xor_stmt);
4058 16 : if (or_tmp_name == NULL_TREE)
4059 : {
4060 : /* Create the 1st XOR when the 2nd data ref is seen. */
4061 : or_tmp_name = xor_tmp_name;
4062 : }
4063 : else
4064 : {
4065 : /* Create: or_tmp = or_tmp | new_xor_tmp. */
4066 0 : sprintf (tmp_name, "orxors%d", i - 1);
4067 0 : new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
4068 : tmp_name);
4069 0 : or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
4070 : or_tmp_name, xor_tmp_name);
4071 0 : gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
4072 0 : or_tmp_name = new_or_tmp_name;
4073 : }
4074 : }
4075 : prev_addr_tmp_name = addr_tmp_name;
4076 : }
4077 : else
4078 : {
4079 : /* Create: "(a_1 | a_2 | a_3 | ... | a_n) & mask" to check if all
4080 : addresses are already aligned. */
4081 28 : if (or_tmp_name != NULL_TREE)
4082 : {
4083 : /* Create: or_tmp = or_tmp | addr_tmp. */
4084 1 : sprintf (tmp_name, "orptrs%d", i);
4085 1 : new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
4086 : tmp_name);
4087 1 : or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
4088 : or_tmp_name, addr_tmp_name);
4089 1 : gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
4090 1 : or_tmp_name = new_or_tmp_name;
4091 : }
4092 : else
4093 : or_tmp_name = addr_tmp_name;
4094 : }
4095 :
4096 : } /* end for i */
4097 :
4098 43 : mask_cst = build_int_cst (int_ptrsize_type, mask);
4099 :
4100 : /* create: and_tmp = or_tmp & mask */
4101 43 : and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
4102 :
4103 43 : and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
4104 : or_tmp_name, mask_cst);
4105 43 : gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
4106 :
4107 : /* Make and_tmp the left operand of the conditional test against zero.
4108 : if and_tmp has a nonzero bit then some address is unaligned. */
4109 43 : ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
4110 43 : part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
4111 : and_tmp_name, ptrsize_zero);
4112 43 : chain_cond_expr (cond_expr, part_cond_expr);
4113 43 : }
4114 :
4115 : /* Function vect_create_cond_for_vla_spec_read.
4116 :
4117 : Create a conditional expression that represents the run-time checks with
4118 : max speculative read amount in VLA modes. We check two things:
4119 : 1) if the max speculative read amount exceeds the min page size
4120 : 2) if the VF is power-of-2 - done by checking the max read amount instead
4121 :
4122 : Input:
4123 : COND_EXPR - input conditional expression. New conditions will be chained
4124 : with logical AND operation.
4125 : LOOP_VINFO - field LOOP_VINFO_MAX_SPEC_READ_AMOUNT contains the max
4126 : possible speculative read amount in VLA modes.
4127 :
4128 : Output:
4129 : COND_EXPR - conditional expression.
4130 :
4131 : The returned COND_EXPR is the conditional expression to be used in the
4132 : if statement that controls which version of the loop gets executed at
4133 : runtime. */
4134 :
4135 : static void
4136 0 : vect_create_cond_for_vla_spec_read (loop_vec_info loop_vinfo, tree *cond_expr)
4137 : {
4138 0 : poly_uint64 read_amount_poly = LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo);
4139 0 : tree amount = build_int_cst (long_unsigned_type_node, read_amount_poly);
4140 :
4141 : /* Both the read amount and the VF must be variants, and the read amount must
4142 : be a constant power-of-2 multiple of the VF. */
4143 0 : unsigned HOST_WIDE_INT multiple;
4144 0 : gcc_assert (!read_amount_poly.is_constant ()
4145 : && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
4146 : && constant_multiple_p (read_amount_poly,
4147 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
4148 : &multiple)
4149 : && pow2p_hwi (multiple));
4150 :
4151 : tree cst_ul_zero = build_int_cstu (long_unsigned_type_node, 0U);
4152 : tree cst_ul_one = build_int_cstu (long_unsigned_type_node, 1U);
4153 : tree cst_ul_pagesize = build_int_cstu (long_unsigned_type_node,
4154 : (unsigned long) param_min_pagesize);
4155 :
4156 : /* Create an expression of "amount & (amount - 1) == 0". */
4157 : tree amount_m1 = fold_build2 (MINUS_EXPR, long_unsigned_type_node,
4158 : amount, cst_ul_one);
4159 : tree amount_and_expr = fold_build2 (BIT_AND_EXPR, long_unsigned_type_node,
4160 : amount, amount_m1);
4161 : tree powof2_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
4162 : amount_and_expr, cst_ul_zero);
4163 : chain_cond_expr (cond_expr, powof2_cond_expr);
4164 :
4165 : /* Create an expression of "amount <= cst_ul_pagesize". */
4166 : tree pagesize_cond_expr = fold_build2 (LE_EXPR, boolean_type_node,
4167 : amount, cst_ul_pagesize);
4168 : chain_cond_expr (cond_expr, pagesize_cond_expr);
4169 : }
4170 :
4171 : /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
4172 : create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
4173 : Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
4174 : and this new condition are true. Treat a null *COND_EXPR as "true". */
4175 :
4176 : static void
4177 3315 : vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
4178 : {
4179 3315 : const vec<vec_object_pair> &pairs
4180 : = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
4181 3315 : unsigned int i;
4182 3315 : vec_object_pair *pair;
4183 3327 : FOR_EACH_VEC_ELT (pairs, i, pair)
4184 : {
4185 12 : tree addr1 = build_fold_addr_expr (pair->first);
4186 12 : tree addr2 = build_fold_addr_expr (pair->second);
4187 12 : tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
4188 : addr1, addr2);
4189 12 : chain_cond_expr (cond_expr, part_cond_expr);
4190 : }
4191 3315 : }
4192 :
4193 : /* Create an expression that is true when all lower-bound conditions for
4194 : the vectorized loop are met. Chain this condition with *COND_EXPR. */
4195 :
4196 : static void
4197 3315 : vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
4198 : {
4199 3315 : const vec<vec_lower_bound> &lower_bounds
4200 : = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
4201 3609 : for (unsigned int i = 0; i < lower_bounds.length (); ++i)
4202 : {
4203 294 : tree expr = lower_bounds[i].expr;
4204 294 : tree type = unsigned_type_for (TREE_TYPE (expr));
4205 294 : expr = fold_convert (type, expr);
4206 294 : poly_uint64 bound = lower_bounds[i].min_value;
4207 294 : if (!lower_bounds[i].unsigned_p)
4208 : {
4209 125 : expr = fold_build2 (PLUS_EXPR, type, expr,
4210 : build_int_cstu (type, bound - 1));
4211 125 : bound += bound - 1;
4212 : }
4213 294 : tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
4214 : build_int_cstu (type, bound));
4215 294 : chain_cond_expr (cond_expr, part_cond_expr);
4216 : }
4217 3315 : }
4218 :
4219 : /* Function vect_create_cond_for_alias_checks.
4220 :
4221 : Create a conditional expression that represents the run-time checks for
4222 : overlapping of address ranges represented by a list of data references
4223 : relations passed as input.
4224 :
4225 : Input:
4226 : COND_EXPR - input conditional expression. New conditions will be chained
4227 : with logical AND operation. If it is NULL, then the function
4228 : is used to return the number of alias checks.
4229 : LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
4230 : to be checked.
4231 :
4232 : Output:
4233 : COND_EXPR - conditional expression.
4234 :
4235 : The returned COND_EXPR is the conditional expression to be used in the if
4236 : statement that controls which version of the loop gets executed at runtime.
4237 : */
4238 :
4239 : void
4240 3315 : vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
4241 : {
4242 3315 : const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
4243 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
4244 :
4245 3315 : if (comp_alias_ddrs.is_empty ())
4246 : return;
4247 :
4248 3168 : create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
4249 : &comp_alias_ddrs, cond_expr);
4250 3168 : if (dump_enabled_p ())
4251 1290 : dump_printf_loc (MSG_NOTE, vect_location,
4252 : "created %u versioning for alias checks.\n",
4253 : comp_alias_ddrs.length ());
4254 : }
4255 :
4256 :
4257 : /* Function vect_loop_versioning.
4258 :
4259 : If the loop has data references that may or may not be aligned or/and
4260 : has data reference relations whose independence was not proven then
4261 : two versions of the loop need to be generated, one which is vectorized
4262 : and one which isn't. A test is then generated to control which of the
4263 : loops is executed. The test checks for the alignment of all of the
4264 : data references that may or may not be aligned. An additional
4265 : sequence of runtime tests is generated for each pairs of DDRs whose
4266 : independence was not proven. The vectorized version of loop is
4267 : executed only if both alias and alignment tests are passed.
4268 :
4269 : The test generated to check which version of loop is executed
4270 : is modified to also check for profitability as indicated by the
4271 : cost model threshold TH.
4272 :
4273 : The versioning precondition(s) are placed in *COND_EXPR and
4274 : *COND_EXPR_STMT_LIST. */
4275 :
class loop *
vect_loop_versioning (loop_vec_info loop_vinfo,
		      gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  basic_block condition_bb;
  gphi_iterator gsi;
  gimple_stmt_iterator cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gphi *orig_phi, *new_phi;
  /* The accumulated versioning condition; NULL means "true" so far.  */
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  tree arg;
  profile_probability prob = profile_probability::likely ();
  gimple_seq gimplify_stmt_list = NULL;
  /* NITERSM1, i.e. the scalar iteration count minus one; the threshold
     comparisons below are therefore biased by one as well.  */
  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
  bool version_spec_read = LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (loop_vinfo);
  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
  poly_uint64 versioning_threshold
    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
  tree version_simd_if_cond
    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);

  /* Start the condition with the cost-model threshold check, but only
     when the threshold is not already subsumed by the versioning
     threshold (!ordered_p) and the iteration count is known.  */
  if (!uncounted_p && vect_apply_runtime_profitability_check_p (loop_vinfo)
      && !ordered_p (th, versioning_threshold))
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			     build_int_cst (TREE_TYPE (scalar_loop_iters),
					    th - 1));
  /* AND in the versioning-threshold check, if there is one.  */
  if (!uncounted_p && maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			       build_int_cst (TREE_TYPE (scalar_loop_iters),
					      versioning_threshold - 1));
      if (cond_expr)
	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
				 expr, cond_expr);
      else
	cond_expr = expr;
    }

  /* COST_NAME, when set, is the gimplified cost/threshold check; it is
     split off to its own basic block at the end of this function.  */
  tree cost_name = NULL_TREE;
  profile_probability prob2 = profile_probability::always ();
  if (cond_expr
      && EXPR_P (cond_expr)
      && (version_niter
	  || version_align
	  || version_alias
	  || version_simd_if_cond))
    {
      cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
						      &cond_expr_stmt_list,
						      is_gimple_val, NULL_TREE);
      /* Split prob () into two so that the overall probability of passing
	 both the cost-model and versioning checks is the orig prob.  */
      prob2 = prob = prob.sqrt ();
    }

  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  if (cond_expr)
    {
      gimple_seq tem = NULL;
      cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
					  &tem, is_gimple_condexpr_for_cond,
					  NULL_TREE);
      gimple_seq_add_seq (&cond_expr_stmt_list, tem);
    }

  /* Chain on the remaining versioning conditions: alignment, VLA
     speculative-read, alias/lower-bound, and the simd-if condition.  */
  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
				       &cond_expr_stmt_list);

  if (version_spec_read)
    vect_create_cond_for_vla_spec_read (loop_vinfo, &cond_expr);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }

  if (version_simd_if_cond)
    {
      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
      /* The simd-if condition must be defined outside both loop
	 versions (checking-only sanity check).  */
      if (flag_checking)
	if (basic_block bb
	    = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
	  gcc_assert (bb != loop->header
		      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
		      && (scalar_loop == NULL
			  || (bb != scalar_loop->header
			      && dominated_by_p (CDI_DOMINATORS,
						 scalar_loop->header, bb))));
      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
      tree c = fold_build2 (NE_EXPR, boolean_type_node,
			    version_simd_if_cond, zero);
      if (cond_expr)
	cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
				 c, cond_expr);
      else
	cond_expr = c;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "created versioning for simd if condition check.\n");
    }

  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
				      &gimplify_stmt_list,
				      is_gimple_condexpr_for_cond, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
     invariant in.  */
  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      update_stmt (stmt);
      ssa_op_iter iter;
      use_operand_p use_p;
      basic_block def_bb;
      /* Any SSA use defined inside OUTERMOST constrains how far out we
	 can hoist the versioning condition.  */
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
	if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
	    && flow_bb_inside_loop_p (outermost, def_bb))
	  outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
    }

  /* Search for the outermost loop we can version.  Avoid versioning of
     non-perfect nests but allow if-conversion versioned loops inside.  */
  class loop *loop_to_version = loop;
  if (flow_loop_nested_p (outermost, loop))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "trying to apply versioning to outer loop %d\n",
			 outermost->num);
      if (outermost->num == 0)
	outermost = superloop_at_depth (loop, 1);
      /* And avoid applying versioning on non-perfect nests.  */
      while (loop_to_version != outermost
	     && (e = single_exit (loop_outer (loop_to_version)))
	     && !(e->flags & EDGE_COMPLEX)
	     && (!loop_outer (loop_to_version)->inner->next
		 || vect_loop_vectorized_call (loop_to_version))
	     && (!loop_outer (loop_to_version)->inner->next
		 || !loop_outer (loop_to_version)->inner->next->next))
	loop_to_version = loop_outer (loop_to_version);
    }

  /* Apply versioning.  If there is already a scalar version created by
     if-conversion re-use that.  Note we cannot re-use the copy of
     an if-converted outer-loop when vectorizing the inner loop only.  */
  gcond *cond;
  if ((!loop_to_version->inner || loop == loop_to_version)
      && loop_vectorized_call)
    {
      /* Re-use the scalar copy made by if-conversion: just rewrite the
	 condition of the existing LOOP_VECTORIZED guard.  */
      gcc_assert (scalar_loop);
      condition_bb = gimple_bb (loop_vectorized_call);
      cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
      gimple_cond_set_condition_from_tree (cond, cond_expr);
      update_stmt (cond);

      if (cond_expr_stmt_list)
	{
	  cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
				 GSI_SAME_STMT);
	}

      /* if-conversion uses profile_probability::always () for both paths,
	 reset the paths probabilities appropriately.  */
      edge te, fe;
      extract_true_false_edges_from_block (condition_bb, &te, &fe);
      te->probability = prob;
      fe->probability = prob.invert ();
      /* We can scale loops counts immediately but have to postpone
	 scaling the scalar loop because we re-use it during peeling.

	 Ifcvt duplicates loop preheader, loop body and produces an basic
	 block after loop exit.  We need to scale all that.  */
      basic_block preheader = loop_preheader_edge (loop_to_version)->src;
      preheader->count = preheader->count.apply_probability (prob * prob2);
      scale_loop_frequencies (loop_to_version, prob * prob2);
      /* When the loop has multiple exits then we can only version itself.
	 This is denoted by loop_to_version == loop.  In this case we can
	 do the versioning by selecting the exit edge the vectorizer is
	 currently using.  */
      edge exit_edge;
      if (loop_to_version == loop)
	exit_edge = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
      else
	exit_edge = single_exit (loop_to_version);
      exit_edge->dest->count = preheader->count;
      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert ();

      nloop = scalar_loop;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "reusing %sloop version created by if conversion\n",
			 loop_to_version != loop ? "outer " : "");
    }
  else
    {
      if (loop_to_version != loop
	  && dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "applying loop versioning to outer loop %d\n",
			 loop_to_version->num);

      unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;

      initialize_original_copy_tables ();
      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
			    prob * prob2, (prob * prob2).invert (),
			    prob * prob2, (prob * prob2).invert (),
			    true);

      /* If the PHI nodes in the loop header were reallocated, we need to fix up
	 our internally stashed copies of those.  */
      if (loop_to_version == loop)
	for (auto gsi = gsi_start_phis (loop->header);
	     !gsi_end_p (gsi); gsi_next (&gsi))
	  loop_vinfo->resync_stmt_addr (gsi.phi ());

      /* We will later insert second conditional so overall outcome of
	 both is prob * prob2.  */
      edge true_e, false_e;
      extract_true_false_edges_from_block (condition_bb, &true_e, &false_e);
      true_e->probability = prob;
      false_e->probability = prob.invert ();
      gcc_assert (nloop);
      /* get_loop_copy maps LOOP to its scalar duplicate made above.  */
      nloop = get_loop_copy (loop);

      /* Assign hierarchical discriminators to distinguish loop versions.
	 Only assign to the scalar version here; the vectorized version will
	 get discriminators later during transformation/peeling.
	 Use dynamic copy_id allocation instead of hardcoded constants.  */
      gimple *nloop_last = last_nondebug_stmt (nloop->header);
      location_t nloop_loc
	= nloop_last ? gimple_location (nloop_last) : UNKNOWN_LOCATION;
      if (nloop_loc != UNKNOWN_LOCATION)
	{
	  unsigned int nloop_copyid = allocate_copyid_base (nloop_loc, 1);
	  assign_discriminators_to_loop (nloop, 0, nloop_copyid);
	}
      /* For cycle vectorization with SLP we rely on the PHI arguments
	 appearing in the same order as the SLP node operands which for the
	 loop PHI nodes means the preheader edge dest index needs to remain
	 the same for the analyzed loop which also becomes the vectorized one.
	 Make it so in case the state after versioning differs by redirecting
	 the first edge into the header to the same destination which moves
	 it last.  */
      if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
	{
	  edge e = EDGE_PRED (loop->header, 0);
	  ssa_redirect_edge (e, e->dest);
	  flush_pending_stmts (e);
	}
      gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);

      /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
	 reap those otherwise;  they also refer to the original
	 loops.  */
      class loop *l = loop;
      while (gimple *call = vect_loop_vectorized_call (l))
	{
	  call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
	  fold_loop_internal_call (call, boolean_false_node);
	  l = loop_outer (l);
	}
      free_original_copy_tables ();

      if (cond_expr_stmt_list)
	{
	  cond_exp_gsi = gsi_last_bb (condition_bb);
	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
				 GSI_SAME_STMT);
	}

      /* Loop versioning violates an assumption we try to maintain during
	 vectorization - that the loop exit block has a single predecessor.
	 After versioning, the exit block of both loop versions is the same
	 basic block (i.e. it has two predecessors).  Just in order to simplify
	 following transformations in the vectorizer, we fix this situation
	 here by adding a new (empty) block on the exit-edge of the loop,
	 with the proper loop-exit phis to maintain loop-closed-form.
	 If loop versioning wasn't done from loop, but scalar_loop instead,
	 merge_bb will have already just a single successor.  */

      /* When the loop has multiple exits then we can only version itself.
	 This is denoted by loop_to_version == loop.  In this case we can
	 do the versioning by selecting the exit edge the vectorizer is
	 currently using.  */
      edge exit_edge;
      if (loop_to_version == loop)
	exit_edge = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
      else
	exit_edge = single_exit (loop_to_version);

      gcc_assert (exit_edge);
      merge_bb = exit_edge->dest;
      if (EDGE_COUNT (merge_bb->preds) >= 2)
	{
	  gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
	  new_exit_bb = split_edge (exit_edge);
	  new_exit_e = exit_edge;
	  e = EDGE_SUCC (new_exit_bb, 0);

	  /* Re-create loop-closed-form PHIs in the new exit block.  */
	  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
	       gsi_next (&gsi))
	    {
	      tree new_res;
	      orig_phi = gsi.phi ();
	      new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	      new_phi = create_phi_node (new_res, new_exit_bb);
	      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
	      add_phi_arg (new_phi, arg, new_exit_e,
			   gimple_phi_arg_location_from_edge (orig_phi, e));
	      adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
	    }
	}

      update_ssa (TODO_update_ssa_no_phi);
    }

  /* Split the cost model check off to a separate BB.  Costing assumes
     this is the only thing we perform when we enter the scalar loop
     from a failed cost decision.  */
  if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
    {
      gimple *def = SSA_NAME_DEF_STMT (cost_name);
      gcc_assert (gimple_bb (def) == condition_bb);
      /* All uses of the cost check are 'true' after the check we
	 are going to insert.  */
      replace_uses_by (cost_name, boolean_true_node);
      /* And we're going to build the new single use of it.  */
      gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
				       NULL_TREE, NULL_TREE);
      edge e = split_block (gimple_bb (def), def);
      gimple_stmt_iterator gsi = gsi_for_stmt (def);
      gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
      edge true_e, false_e;
      extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
      e->flags &= ~EDGE_FALLTHRU;
      e->flags |= EDGE_TRUE_VALUE;
      edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
      e->probability = prob2;
      e2->probability = prob2.invert ();
      e->dest->count = e->count ();
      /* Fix up dominators: blocks previously dominated by E->dest that
	 now have a second predecessor are dominated by E->src instead.  */
      set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
      auto_vec<basic_block, 3> adj;
      for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
	   son;
	   son = next_dom_son (CDI_DOMINATORS, son))
	if (EDGE_COUNT (son->preds) > 1)
	  adj.safe_push (son);
      for (auto son : adj)
	set_immediate_dominator (CDI_DOMINATORS, son, e->src);
      //debug_bb (condition_bb);
      //debug_bb (e->src);
    }

  if (version_niter)
    {
      /* The versioned loop could be infinite, we need to clear existing
	 niter information which is copied from the original loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
    }

  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization because of "
			 "possible aliasing\n");
      if (version_align)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization to enhance "
			 "alignment\n");

    }

  return nloop;
}
|