Branch data Line data Source code
1 : : /* Vectorizer Specific Loop Manipulations
2 : : Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : : and Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #include "config.h"
23 : : #include "system.h"
24 : : #include "coretypes.h"
25 : : #include "backend.h"
26 : : #include "tree.h"
27 : : #include "gimple.h"
28 : : #include "cfghooks.h"
29 : : #include "tree-pass.h"
30 : : #include "ssa.h"
31 : : #include "fold-const.h"
32 : : #include "cfganal.h"
33 : : #include "gimplify.h"
34 : : #include "gimple-iterator.h"
35 : : #include "gimplify-me.h"
36 : : #include "tree-cfg.h"
37 : : #include "tree-ssa-loop-manip.h"
38 : : #include "tree-into-ssa.h"
39 : : #include "tree-ssa.h"
40 : : #include "cfgloop.h"
41 : : #include "tree-scalar-evolution.h"
42 : : #include "tree-vectorizer.h"
43 : : #include "tree-ssa-loop-ivopts.h"
44 : : #include "gimple-fold.h"
45 : : #include "tree-ssa-loop-niter.h"
46 : : #include "internal-fn.h"
47 : : #include "stor-layout.h"
48 : : #include "optabs-query.h"
49 : : #include "vec-perm-indices.h"
50 : : #include "insn-config.h"
51 : : #include "rtl.h"
52 : : #include "recog.h"
53 : : #include "langhooks.h"
54 : : #include "tree-vector-builder.h"
55 : : #include "optabs-tree.h"
56 : :
57 : : /*************************************************************************
58 : : Simple Loop Peeling Utilities
59 : :
60 : : Utilities to support loop peeling for vectorization purposes.
61 : : *************************************************************************/
62 : :
63 : :
64 : : /* Renames the use *OP_P. */
65 : :
66 : : static void
67 : 844851 : rename_use_op (use_operand_p op_p)
68 : : {
69 : 844851 : tree new_name;
70 : :
71 : 844851 : if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
72 : : return;
73 : :
74 : 842309 : new_name = get_current_def (USE_FROM_PTR (op_p));
75 : :
76 : : /* Something defined outside of the loop. */
77 : 842309 : if (!new_name)
78 : : return;
79 : :
80 : : /* An ordinary ssa name defined in the loop. */
81 : :
82 : 726988 : SET_USE (op_p, new_name);
83 : : }
84 : :
85 : :
86 : : /* Renames the variables in basic block BB. Allow renaming of PHI arguments
87 : : on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
88 : : true. */
89 : :
90 : : static void
91 : 100872 : rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
92 : : {
93 : 100872 : gimple *stmt;
94 : 100872 : use_operand_p use_p;
95 : 100872 : ssa_op_iter iter;
96 : 100872 : edge e;
97 : 100872 : edge_iterator ei;
98 : 100872 : class loop *loop = bb->loop_father;
99 : 100872 : class loop *outer_loop = NULL;
100 : :
101 : 100872 : if (rename_from_outer_loop)
102 : : {
103 : 633 : gcc_assert (loop);
104 : 633 : outer_loop = loop_outer (loop);
105 : : }
106 : :
107 : 682414 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
108 : 480670 : gsi_next (&gsi))
109 : : {
110 : 480670 : stmt = gsi_stmt (gsi);
111 : 1185676 : FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
112 : 705006 : rename_use_op (use_p);
113 : : }
114 : :
115 : 205556 : FOR_EACH_EDGE (e, ei, bb->preds)
116 : : {
117 : 104684 : if (!flow_bb_inside_loop_p (loop, e->src))
118 : : {
119 : 31771 : if (!rename_from_outer_loop)
120 : 31563 : continue;
121 : 208 : if (e->src != outer_loop->header)
122 : : {
123 : 114 : if (outer_loop->inner->next)
124 : : {
125 : : /* If outer_loop has 2 inner loops, allow there to
126 : : be an extra basic block which decides which of the
127 : : two loops to use using LOOP_VECTORIZED. */
128 : 111 : if (!single_pred_p (e->src)
129 : 10 : || single_pred (e->src) != outer_loop->header)
130 : 101 : continue;
131 : : }
132 : : }
133 : : }
134 : 168211 : for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
135 : 95191 : gsi_next (&gsi))
136 : 95191 : rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
137 : : }
138 : 100872 : }
139 : :
140 : :
141 : : struct adjust_info
142 : : {
143 : : tree from, to;
144 : : basic_block bb;
145 : : };
146 : :
147 : : /* A stack of values to be adjusted in debug stmts. We have to
148 : : process them LIFO, so that the closest substitution applies. If we
149 : : processed them FIFO, without the stack, we might substitute uses
150 : : with a PHI DEF that would soon become non-dominant, and when we got
151 : : to the suitable one, it wouldn't have anything to substitute any
152 : : more. */
153 : : static vec<adjust_info, va_heap> adjust_vec;
154 : :
155 : : /* Adjust any debug stmts that referenced AI->from values to use the
156 : : loop-closed AI->to, if the references are dominated by AI->bb and
157 : : not by the definition of AI->from. */
158 : :
159 : : static void
160 : 75853 : adjust_debug_stmts_now (adjust_info *ai)
161 : : {
162 : 75853 : basic_block bbphi = ai->bb;
163 : 75853 : tree orig_def = ai->from;
164 : 75853 : tree new_def = ai->to;
165 : 75853 : imm_use_iterator imm_iter;
166 : 75853 : gimple *stmt;
167 : 75853 : basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
168 : :
169 : 75853 : gcc_assert (dom_info_available_p (CDI_DOMINATORS));
170 : :
171 : : /* Adjust any debug stmts that held onto non-loop-closed
172 : : references. */
173 : 310828 : FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
174 : : {
175 : 234975 : use_operand_p use_p;
176 : 234975 : basic_block bbuse;
177 : :
178 : 234975 : if (!is_gimple_debug (stmt))
179 : 174495 : continue;
180 : :
181 : 60480 : gcc_assert (gimple_debug_bind_p (stmt));
182 : :
183 : 60480 : bbuse = gimple_bb (stmt);
184 : :
185 : 60480 : if ((bbuse == bbphi
186 : 60480 : || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
187 : 61748 : && !(bbuse == bbdef
188 : 634 : || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
189 : : {
190 : 0 : if (new_def)
191 : 0 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
192 : 0 : SET_USE (use_p, new_def);
193 : : else
194 : : {
195 : 0 : gimple_debug_bind_reset_value (stmt);
196 : 0 : update_stmt (stmt);
197 : : }
198 : : }
199 : 75853 : }
200 : 75853 : }
201 : :
202 : : /* Adjust debug stmts as scheduled before. */
203 : :
204 : : static void
205 : 30661 : adjust_vec_debug_stmts (void)
206 : : {
207 : 30661 : if (!MAY_HAVE_DEBUG_BIND_STMTS)
208 : : return;
209 : :
210 : 11480 : gcc_assert (adjust_vec.exists ());
211 : :
212 : 87222 : while (!adjust_vec.is_empty ())
213 : : {
214 : 75742 : adjust_debug_stmts_now (&adjust_vec.last ());
215 : 75742 : adjust_vec.pop ();
216 : : }
217 : : }
218 : :
219 : : /* Adjust any debug stmts that referenced FROM values to use the
220 : : loop-closed TO, if the references are dominated by BB and not by
221 : : the definition of FROM. If adjust_vec is non-NULL, adjustments
222 : : will be postponed until adjust_vec_debug_stmts is called. */
223 : :
224 : : static void
225 : 100746 : adjust_debug_stmts (tree from, tree to, basic_block bb)
226 : : {
227 : 100746 : adjust_info ai;
228 : :
229 : 100746 : if (MAY_HAVE_DEBUG_BIND_STMTS
230 : 100746 : && TREE_CODE (from) == SSA_NAME
231 : 92346 : && ! SSA_NAME_IS_DEFAULT_DEF (from)
232 : 191775 : && ! virtual_operand_p (from))
233 : : {
234 : 75853 : ai.from = from;
235 : 75853 : ai.to = to;
236 : 75853 : ai.bb = bb;
237 : :
238 : 75853 : if (adjust_vec.exists ())
239 : 75742 : adjust_vec.safe_push (ai);
240 : : else
241 : 111 : adjust_debug_stmts_now (&ai);
242 : : }
243 : 100746 : }
244 : :
245 : : /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
246 : : to adjust any debug stmts that referenced the old phi arg,
247 : : presumably non-loop-closed references left over from other
248 : : transformations. */
249 : :
250 : : static void
251 : 199216 : adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
252 : : {
253 : 199216 : tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
254 : :
255 : 199216 : gcc_assert (TREE_CODE (orig_def) != SSA_NAME
256 : : || orig_def != new_def);
257 : :
258 : 199216 : SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
259 : :
260 : 199216 : if (MAY_HAVE_DEBUG_BIND_STMTS)
261 : 78810 : adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
262 : : gimple_bb (update_phi));
263 : 199216 : }
264 : :
265 : : /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
266 : : that the control should have during the first iteration and NEXT_CTRL is the
267 : : value that it should have on subsequent iterations. */
268 : :
269 : : static void
270 : 8 : vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
271 : : tree next_ctrl)
272 : : {
273 : 8 : gphi *phi = create_phi_node (ctrl, loop->header);
274 : 8 : add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
275 : 8 : add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
276 : 8 : }
277 : :
278 : : /* Add SEQ to the end of LOOP's preheader block. */
279 : :
280 : : static void
281 : 7 : add_preheader_seq (class loop *loop, gimple_seq seq)
282 : : {
283 : 7 : if (seq)
284 : : {
285 : 4 : edge pe = loop_preheader_edge (loop);
286 : 4 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
287 : 4 : gcc_assert (!new_bb);
288 : : }
289 : 7 : }
290 : :
291 : : /* Add SEQ to the beginning of LOOP's header block. */
292 : :
293 : : static void
294 : 0 : add_header_seq (class loop *loop, gimple_seq seq)
295 : : {
296 : 0 : if (seq)
297 : : {
298 : 0 : gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
299 : 0 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
300 : : }
301 : 0 : }
302 : :
303 : : /* Return true if the target can interleave elements of two vectors.
304 : : OFFSET is 0 if the first half of the vectors should be interleaved
305 : : or 1 if the second half should. When returning true, store the
306 : : associated permutation in INDICES. */
307 : :
308 : : static bool
309 : 0 : interleave_supported_p (vec_perm_indices *indices, tree vectype,
310 : : unsigned int offset)
311 : : {
312 : 0 : poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
313 : 0 : poly_uint64 base = exact_div (nelts, 2) * offset;
314 : 0 : vec_perm_builder sel (nelts, 2, 3);
315 : 0 : for (unsigned int i = 0; i < 3; ++i)
316 : : {
317 : 0 : sel.quick_push (base + i);
318 : 0 : sel.quick_push (base + i + nelts);
319 : : }
320 : 0 : indices->new_vector (sel, 2, nelts);
321 : 0 : return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
322 : 0 : *indices);
323 : 0 : }
324 : :
325 : : /* Try to use permutes to define the masks in DEST_RGM using the masks
326 : : in SRC_RGM, given that the former has twice as many masks as the
327 : : latter. Return true on success, adding any new statements to SEQ. */
328 : :
329 : : static bool
330 : 0 : vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
331 : : rgroup_controls *src_rgm)
332 : : {
333 : 0 : tree src_masktype = src_rgm->type;
334 : 0 : tree dest_masktype = dest_rgm->type;
335 : 0 : machine_mode src_mode = TYPE_MODE (src_masktype);
336 : 0 : insn_code icode1, icode2;
337 : 0 : if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
338 : 0 : && (icode1 = optab_handler (vec_unpacku_hi_optab,
339 : : src_mode)) != CODE_FOR_nothing
340 : 0 : && (icode2 = optab_handler (vec_unpacku_lo_optab,
341 : : src_mode)) != CODE_FOR_nothing)
342 : : {
343 : : /* Unpacking the source masks gives at least as many mask bits as
344 : : we need. We can then VIEW_CONVERT any excess bits away. */
345 : 0 : machine_mode dest_mode = insn_data[icode1].operand[0].mode;
346 : 0 : gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
347 : 0 : tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
348 : 0 : for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
349 : : {
350 : 0 : tree src = src_rgm->controls[i / 2];
351 : 0 : tree dest = dest_rgm->controls[i];
352 : 0 : tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
353 : 0 : ? VEC_UNPACK_HI_EXPR
354 : : : VEC_UNPACK_LO_EXPR);
355 : 0 : gassign *stmt;
356 : 0 : if (dest_masktype == unpack_masktype)
357 : 0 : stmt = gimple_build_assign (dest, code, src);
358 : : else
359 : : {
360 : 0 : tree temp = make_ssa_name (unpack_masktype);
361 : 0 : stmt = gimple_build_assign (temp, code, src);
362 : 0 : gimple_seq_add_stmt (seq, stmt);
363 : 0 : stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
364 : : build1 (VIEW_CONVERT_EXPR,
365 : : dest_masktype, temp));
366 : : }
367 : 0 : gimple_seq_add_stmt (seq, stmt);
368 : : }
369 : : return true;
370 : : }
371 : 0 : vec_perm_indices indices[2];
372 : 0 : if (dest_masktype == src_masktype
373 : 0 : && interleave_supported_p (&indices[0], src_masktype, 0)
374 : 0 : && interleave_supported_p (&indices[1], src_masktype, 1))
375 : : {
376 : : /* The destination requires twice as many mask bits as the source, so
377 : : we can use interleaving permutes to double up the number of bits. */
378 : : tree masks[2];
379 : 0 : for (unsigned int i = 0; i < 2; ++i)
380 : 0 : masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
381 : 0 : for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
382 : : {
383 : 0 : tree src = src_rgm->controls[i / 2];
384 : 0 : tree dest = dest_rgm->controls[i];
385 : 0 : gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
386 : 0 : src, src, masks[i & 1]);
387 : 0 : gimple_seq_add_stmt (seq, stmt);
388 : : }
389 : 0 : return true;
390 : : }
391 : : return false;
392 : 0 : }
393 : :
394 : : /* Populate DEST_RGM->controls, given that they should add up to STEP.
395 : :
396 : : STEP = MIN_EXPR <ivtmp_34, VF>;
397 : :
398 : : First length (MIN (X, VF/N)):
399 : : loop_len_15 = MIN_EXPR <STEP, VF/N>;
400 : :
401 : : Second length:
402 : : tmp = STEP - loop_len_15;
403 : : loop_len_16 = MIN (tmp, VF/N);
404 : :
405 : : Third length:
406 : : tmp2 = tmp - loop_len_16;
407 : : loop_len_17 = MIN (tmp2, VF/N);
408 : :
409 : : Last length:
410 : : loop_len_18 = tmp2 - loop_len_17;
411 : : */
412 : :
413 : : static void
414 : 0 : vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
415 : : rgroup_controls *dest_rgm, tree step)
416 : : {
417 : 0 : tree ctrl_type = dest_rgm->type;
418 : 0 : poly_uint64 nitems_per_ctrl
419 : 0 : = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
420 : 0 : tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
421 : :
422 : 0 : for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
423 : : {
424 : 0 : tree ctrl = dest_rgm->controls[i];
425 : 0 : if (i == 0)
426 : : {
427 : : /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */
428 : 0 : gassign *assign
429 : 0 : = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
430 : 0 : gimple_seq_add_stmt (seq, assign);
431 : : }
432 : 0 : else if (i == dest_rgm->controls.length () - 1)
433 : : {
434 : : /* Last iteration: Remain capped to the range [0, VF/N]. */
435 : 0 : gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
436 : 0 : dest_rgm->controls[i - 1]);
437 : 0 : gimple_seq_add_stmt (seq, assign);
438 : : }
439 : : else
440 : : {
441 : : /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
442 : 0 : step = gimple_build (seq, MINUS_EXPR, iv_type, step,
443 : 0 : dest_rgm->controls[i - 1]);
444 : 0 : gassign *assign
445 : 0 : = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
446 : 0 : gimple_seq_add_stmt (seq, assign);
447 : : }
448 : : }
449 : 0 : }
450 : :
451 : : /* Stores the standard position for induction variable increment in belonging to
452 : : LOOP_EXIT (just before the exit condition of the given exit to BSI.
453 : : INSERT_AFTER is set to true if the increment should be inserted after
454 : : *BSI. */
455 : :
456 : : void
457 : 55498 : vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
458 : : bool *insert_after)
459 : : {
460 : 55498 : basic_block bb = loop_exit->src;
461 : 55498 : *bsi = gsi_last_bb (bb);
462 : 55498 : *insert_after = false;
463 : 55498 : }
464 : :
465 : : /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
466 : : for all the rgroup controls in RGC and return a control that is nonzero
467 : : when the loop needs to iterate. Add any new preheader statements to
468 : : PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
469 : :
470 : : RGC belongs to loop LOOP. The loop originally iterated NITERS
471 : : times and has been vectorized according to LOOP_VINFO.
472 : :
473 : : If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
474 : : starts with NITERS_SKIP dummy iterations of the scalar loop before
475 : : the real work starts. The mask elements for these dummy iterations
476 : : must be 0, to ensure that the extra iterations do not have an effect.
477 : :
478 : : It is known that:
479 : :
480 : : NITERS * RGC->max_nscalars_per_iter * RGC->factor
481 : :
482 : : does not overflow. However, MIGHT_WRAP_P says whether an induction
483 : : variable that starts at 0 and has step:
484 : :
485 : : VF * RGC->max_nscalars_per_iter * RGC->factor
486 : :
487 : : might overflow before hitting a value above:
488 : :
489 : : (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
490 : :
491 : : This means that we cannot guarantee that such an induction variable
492 : : would ever hit a value that produces a set of all-false masks or zero
493 : : lengths for RGC.
494 : :
495 : : Note: the cost of the code generated by this function is modeled
496 : : by vect_estimate_min_profitable_iters, so changes here may need
497 : : corresponding changes there. */
498 : :
499 : : static tree
500 : 0 : vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
501 : : gimple_seq *preheader_seq,
502 : : gimple_seq *header_seq,
503 : : gimple_stmt_iterator loop_cond_gsi,
504 : : rgroup_controls *rgc, tree niters,
505 : : tree niters_skip, bool might_wrap_p,
506 : : tree *iv_step, tree *compare_step)
507 : : {
508 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
509 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
510 : 0 : bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
511 : :
512 : 0 : tree ctrl_type = rgc->type;
513 : 0 : unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
514 : 0 : poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
515 : 0 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
516 : 0 : tree length_limit = NULL_TREE;
517 : : /* For length, we need length_limit to ensure length in range. */
518 : 0 : if (!use_masks_p)
519 : 0 : length_limit = build_int_cst (compare_type, nitems_per_ctrl);
520 : :
521 : : /* Calculate the maximum number of item values that the rgroup
522 : : handles in total, the number that it handles for each iteration
523 : : of the vector loop, and the number that it should skip during the
524 : : first iteration of the vector loop. */
525 : 0 : tree nitems_total = niters;
526 : 0 : tree nitems_step = build_int_cst (iv_type, vf);
527 : 0 : tree nitems_skip = niters_skip;
528 : 0 : if (nitems_per_iter != 1)
529 : : {
530 : : /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
531 : : these multiplications don't overflow. */
532 : 0 : tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
533 : 0 : tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
534 : 0 : nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
535 : : nitems_total, compare_factor);
536 : 0 : nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
537 : : nitems_step, iv_factor);
538 : 0 : if (nitems_skip)
539 : 0 : nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
540 : : nitems_skip, compare_factor);
541 : : }
542 : :
543 : : /* Create an induction variable that counts the number of items
544 : : processed. */
545 : 0 : tree index_before_incr, index_after_incr;
546 : 0 : gimple_stmt_iterator incr_gsi;
547 : 0 : bool insert_after;
548 : 0 : edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
549 : 0 : vect_iv_increment_position (exit_e, &incr_gsi, &insert_after);
550 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
551 : : {
552 : : /* Create an IV that counts down from niters_total and whose step
553 : : is the (variable) amount processed in the current iteration:
554 : : ...
555 : : _10 = (unsigned long) count_12(D);
556 : : ...
557 : : # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
558 : : _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
559 : : ...
560 : : vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
561 : : ...
562 : : ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
563 : : ...
564 : : if (ivtmp_9 > POLY_INT_CST [4, 4])
565 : : goto <bb 4>; [83.33%]
566 : : else
567 : : goto <bb 5>; [16.67%]
568 : : */
569 : 0 : nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
570 : 0 : tree step = rgc->controls.length () == 1 ? rgc->controls[0]
571 : 0 : : make_ssa_name (iv_type);
572 : : /* Create decrement IV. */
573 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
574 : : {
575 : 0 : create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
576 : : insert_after, &index_before_incr, &index_after_incr);
577 : 0 : tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
578 : : index_before_incr, nitems_step);
579 : 0 : gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
580 : : }
581 : : else
582 : : {
583 : 0 : create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
584 : : &incr_gsi, insert_after, &index_before_incr,
585 : : &index_after_incr);
586 : 0 : gimple_seq_add_stmt (header_seq,
587 : 0 : gimple_build_assign (step, MIN_EXPR,
588 : : index_before_incr,
589 : : nitems_step));
590 : : }
591 : 0 : *iv_step = step;
592 : 0 : *compare_step = nitems_step;
593 : 0 : return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
594 : 0 : : index_before_incr;
595 : : }
596 : :
597 : : /* Create increment IV. */
598 : 0 : create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
599 : : loop, &incr_gsi, insert_after, &index_before_incr,
600 : : &index_after_incr);
601 : :
602 : 0 : tree zero_index = build_int_cst (compare_type, 0);
603 : 0 : tree test_index, test_limit, first_limit;
604 : 0 : gimple_stmt_iterator *test_gsi;
605 : 0 : if (might_wrap_p)
606 : : {
607 : : /* In principle the loop should stop iterating once the incremented
608 : : IV reaches a value greater than or equal to:
609 : :
610 : : NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
611 : :
612 : : However, there's no guarantee that this addition doesn't overflow
613 : : the comparison type, or that the IV hits a value above it before
614 : : wrapping around. We therefore adjust the limit down by one
615 : : IV step:
616 : :
617 : : (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
618 : : -[infinite-prec] NITEMS_STEP
619 : :
620 : : and compare the IV against this limit _before_ incrementing it.
621 : : Since the comparison type is unsigned, we actually want the
622 : : subtraction to saturate at zero:
623 : :
624 : : (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
625 : : -[sat] NITEMS_STEP
626 : :
627 : : And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
628 : :
629 : : NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
630 : :
631 : : where the rightmost subtraction can be done directly in
632 : : COMPARE_TYPE. */
633 : 0 : test_index = index_before_incr;
634 : 0 : tree adjust = gimple_convert (preheader_seq, compare_type,
635 : : nitems_step);
636 : 0 : if (nitems_skip)
637 : 0 : adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
638 : : adjust, nitems_skip);
639 : 0 : test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
640 : : nitems_total, adjust);
641 : 0 : test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
642 : : test_limit, adjust);
643 : 0 : test_gsi = &incr_gsi;
644 : :
645 : : /* Get a safe limit for the first iteration. */
646 : 0 : if (nitems_skip)
647 : : {
648 : : /* The first vector iteration can handle at most NITEMS_STEP
649 : : items. NITEMS_STEP <= CONST_LIMIT, and adding
650 : : NITEMS_SKIP to that cannot overflow. */
651 : 0 : tree const_limit = build_int_cst (compare_type,
652 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)
653 : 0 : * nitems_per_iter);
654 : 0 : first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
655 : : nitems_total, const_limit);
656 : 0 : first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
657 : : first_limit, nitems_skip);
658 : : }
659 : : else
660 : : /* For the first iteration it doesn't matter whether the IV hits
661 : : a value above NITEMS_TOTAL. That only matters for the latch
662 : : condition. */
663 : : first_limit = nitems_total;
664 : : }
665 : : else
666 : : {
667 : : /* Test the incremented IV, which will always hit a value above
668 : : the bound before wrapping. */
669 : 0 : test_index = index_after_incr;
670 : 0 : test_limit = nitems_total;
671 : 0 : if (nitems_skip)
672 : 0 : test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
673 : : test_limit, nitems_skip);
674 : : test_gsi = &loop_cond_gsi;
675 : :
676 : : first_limit = test_limit;
677 : : }
678 : :
679 : : /* Convert the IV value to the comparison type (either a no-op or
680 : : a demotion). */
681 : 0 : gimple_seq test_seq = NULL;
682 : 0 : test_index = gimple_convert (&test_seq, compare_type, test_index);
683 : 0 : gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
684 : :
685 : : /* Provide a definition of each control in the group. */
686 : 0 : tree next_ctrl = NULL_TREE;
687 : 0 : tree ctrl;
688 : 0 : unsigned int i;
689 : 0 : FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
690 : : {
691 : : /* Previous controls will cover BIAS items. This control covers the
692 : : next batch. */
693 : 0 : poly_uint64 bias = nitems_per_ctrl * i;
694 : 0 : tree bias_tree = build_int_cst (compare_type, bias);
695 : :
696 : : /* See whether the first iteration of the vector loop is known
697 : : to have a full control. */
698 : 0 : poly_uint64 const_limit;
699 : 0 : bool first_iteration_full
700 : 0 : = (poly_int_tree_p (first_limit, &const_limit)
701 : 0 : && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
702 : :
703 : : /* Rather than have a new IV that starts at BIAS and goes up to
704 : : TEST_LIMIT, prefer to use the same 0-based IV for each control
705 : : and adjust the bound down by BIAS. */
706 : 0 : tree this_test_limit = test_limit;
707 : 0 : if (i != 0)
708 : : {
709 : 0 : this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
710 : : compare_type, this_test_limit,
711 : : bias_tree);
712 : 0 : this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
713 : : compare_type, this_test_limit,
714 : : bias_tree);
715 : : }
716 : :
717 : : /* Create the initial control. First include all items that
718 : : are within the loop limit. */
719 : 0 : tree init_ctrl = NULL_TREE;
720 : 0 : if (!first_iteration_full)
721 : : {
722 : 0 : tree start, end;
723 : 0 : if (first_limit == test_limit)
724 : : {
725 : : /* Use a natural test between zero (the initial IV value)
726 : : and the loop limit. The "else" block would be valid too,
727 : : but this choice can avoid the need to load BIAS_TREE into
728 : : a register. */
729 : : start = zero_index;
730 : : end = this_test_limit;
731 : : }
732 : : else
733 : : {
734 : : /* FIRST_LIMIT is the maximum number of items handled by the
735 : : first iteration of the vector loop. Test the portion
736 : : associated with this control. */
737 : 0 : start = bias_tree;
738 : 0 : end = first_limit;
739 : : }
740 : :
741 : 0 : if (use_masks_p)
742 : 0 : init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
743 : : start, end, "max_mask");
744 : : else
745 : : {
746 : 0 : init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
747 : 0 : gimple_seq seq = vect_gen_len (init_ctrl, start,
748 : : end, length_limit);
749 : 0 : gimple_seq_add_seq (preheader_seq, seq);
750 : : }
751 : : }
752 : :
753 : : /* Now AND out the bits that are within the number of skipped
754 : : items. */
755 : 0 : poly_uint64 const_skip;
756 : 0 : if (nitems_skip
757 : 0 : && !(poly_int_tree_p (nitems_skip, &const_skip)
758 : 0 : && known_le (const_skip, bias)))
759 : : {
760 : 0 : gcc_assert (use_masks_p);
761 : 0 : tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
762 : : bias_tree, nitems_skip);
763 : 0 : if (init_ctrl)
764 : 0 : init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
765 : : init_ctrl, unskipped_mask);
766 : : else
767 : : init_ctrl = unskipped_mask;
768 : : }
769 : :
770 : 0 : if (!init_ctrl)
771 : : {
772 : : /* First iteration is full. */
773 : 0 : if (use_masks_p)
774 : 0 : init_ctrl = build_minus_one_cst (ctrl_type);
775 : : else
776 : : init_ctrl = length_limit;
777 : : }
778 : :
779 : : /* Get the control value for the next iteration of the loop. */
780 : 0 : if (use_masks_p)
781 : : {
782 : 0 : gimple_seq stmts = NULL;
783 : 0 : next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
784 : : this_test_limit, "next_mask");
785 : 0 : gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
786 : : }
787 : : else
788 : : {
789 : 0 : next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
790 : 0 : gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
791 : : length_limit);
792 : 0 : gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
793 : : }
794 : :
795 : 0 : vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
796 : : }
797 : :
798 : 0 : int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
799 : 0 : if (partial_load_bias != 0)
800 : : {
801 : 0 : tree adjusted_len = rgc->bias_adjusted_ctrl;
802 : 0 : gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
803 : 0 : rgc->controls[0],
804 : : build_int_cst
805 : 0 : (TREE_TYPE (rgc->controls[0]),
806 : : partial_load_bias));
807 : 0 : gimple_seq_add_stmt (header_seq, minus);
808 : : }
809 : :
810 : : return next_ctrl;
811 : : }
812 : :
813 : : /* Set up the iteration condition and rgroup controls for LOOP, given
814 : : that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
815 : : loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
816 : : the number of iterations of the original scalar loop that should be
817 : : handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
818 : : for vect_set_loop_condition.
819 : :
820 : : Insert the branch-back condition before LOOP_COND_GSI and return the
821 : : final gcond. */
822 : :
823 : : static gcond *
824 : 0 : vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
825 : : loop_vec_info loop_vinfo, tree niters,
826 : : tree final_iv, bool niters_maybe_zero,
827 : : gimple_stmt_iterator loop_cond_gsi)
828 : : {
829 : 0 : gimple_seq preheader_seq = NULL;
830 : 0 : gimple_seq header_seq = NULL;
831 : :
832 : 0 : bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
833 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
834 : 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
835 : 0 : tree orig_niters = niters;
836 : :
837 : : /* Type of the initial value of NITERS. */
838 : 0 : tree ni_actual_type = TREE_TYPE (niters);
839 : 0 : unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
840 : 0 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
841 : 0 : if (niters_skip)
842 : 0 : niters_skip = gimple_convert (&preheader_seq, compare_type, niters_skip);
843 : :
844 : : /* Convert NITERS to the same size as the compare. */
845 : 0 : if (compare_precision > ni_actual_precision
846 : 0 : && niters_maybe_zero)
847 : : {
848 : : /* We know that there is always at least one iteration, so if the
849 : : count is zero then it must have wrapped. Cope with this by
850 : : subtracting 1 before the conversion and adding 1 to the result. */
851 : 0 : gcc_assert (TYPE_UNSIGNED (ni_actual_type));
852 : 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
853 : : niters, build_minus_one_cst (ni_actual_type));
854 : 0 : niters = gimple_convert (&preheader_seq, compare_type, niters);
855 : 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
856 : : niters, build_one_cst (compare_type));
857 : : }
858 : : else
859 : 0 : niters = gimple_convert (&preheader_seq, compare_type, niters);
860 : :
861 : : /* Iterate over all the rgroups and fill in their controls. We could use
862 : : the first control from any rgroup for the loop condition; here we
863 : : arbitrarily pick the last. */
864 : 0 : tree test_ctrl = NULL_TREE;
865 : 0 : tree iv_step = NULL_TREE;
866 : 0 : tree compare_step = NULL_TREE;
867 : 0 : rgroup_controls *rgc;
868 : 0 : rgroup_controls *iv_rgc = nullptr;
869 : 0 : unsigned int i;
870 : 0 : auto_vec<rgroup_controls> *controls = use_masks_p
871 : 0 : ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
872 : : : &LOOP_VINFO_LENS (loop_vinfo);
873 : 0 : FOR_EACH_VEC_ELT (*controls, i, rgc)
874 : 0 : if (!rgc->controls.is_empty ())
875 : : {
876 : : /* First try using permutes. This adds a single vector
877 : : instruction to the loop for each mask, but needs no extra
878 : : loop invariants or IVs. */
879 : 0 : unsigned int nmasks = i + 1;
880 : 0 : if (use_masks_p && (nmasks & 1) == 0)
881 : : {
882 : 0 : rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
883 : 0 : if (!half_rgc->controls.is_empty ()
884 : 0 : && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
885 : 0 : continue;
886 : : }
887 : :
888 : 0 : if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
889 : 0 : || !iv_rgc
890 : 0 : || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
891 : 0 : != rgc->max_nscalars_per_iter * rgc->factor))
892 : : {
893 : : /* See whether zero-based IV would ever generate all-false masks
894 : : or zero length before wrapping around. */
895 : 0 : bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
896 : :
897 : : /* Set up all controls for this group. */
898 : 0 : test_ctrl
899 : 0 : = vect_set_loop_controls_directly (loop, loop_vinfo,
900 : : &preheader_seq, &header_seq,
901 : : loop_cond_gsi, rgc, niters,
902 : : niters_skip, might_wrap_p,
903 : : &iv_step, &compare_step);
904 : :
905 : 0 : iv_rgc = rgc;
906 : : }
907 : :
908 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
909 : 0 : && rgc->controls.length () > 1)
910 : : {
911 : : /* vect_set_loop_controls_directly creates an IV whose step
912 : : is equal to the expected sum of RGC->controls. Use that
913 : : information to populate RGC->controls. */
914 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
915 : 0 : gcc_assert (iv_step);
916 : 0 : vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
917 : : }
918 : : }
919 : :
920 : : /* Emit all accumulated statements. */
921 : 0 : add_preheader_seq (loop, preheader_seq);
922 : 0 : add_header_seq (loop, header_seq);
923 : :
924 : : /* Get a boolean result that tells us whether to iterate. */
925 : 0 : gcond *cond_stmt;
926 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
927 : 0 : && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
928 : : {
929 : 0 : gcc_assert (compare_step);
930 : 0 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
931 : 0 : cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
932 : : NULL_TREE);
933 : 0 : }
934 : : else
935 : : {
936 : 0 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
937 : 0 : tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
938 : 0 : cond_stmt
939 : 0 : = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
940 : : }
941 : 0 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
942 : :
943 : : /* The loop iterates (NITERS - 1) / VF + 1 times.
944 : : Subtract one from this to get the latch count. */
945 : 0 : tree step = build_int_cst (compare_type,
946 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo));
947 : 0 : tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
948 : : build_minus_one_cst (compare_type));
949 : 0 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
950 : : niters_minus_one, step);
951 : :
952 : 0 : if (final_iv)
953 : : {
954 : 0 : gassign *assign;
955 : : /* If vectorizing an inverted early break loop we have to restart the
956 : : scalar loop at niters - vf. This matches what we do in
957 : : vect_gen_vector_loop_niters_mult_vf for non-masked loops. */
958 : 0 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
959 : : {
960 : 0 : tree ftype = TREE_TYPE (orig_niters);
961 : 0 : tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
962 : 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
963 : : }
964 : : else
965 : 0 : assign = gimple_build_assign (final_iv, orig_niters);
966 : 0 : gsi_insert_on_edge_immediate (exit_edge, assign);
967 : : }
968 : :
969 : 0 : return cond_stmt;
970 : : }
971 : :
972 : : /* Set up the iteration condition and rgroup controls for LOOP in AVX512
973 : : style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
974 : : vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
975 : : the number of iterations of the original scalar loop that should be
976 : : handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
977 : : for vect_set_loop_condition.
978 : :
979 : : Insert the branch-back condition before LOOP_COND_GSI and return the
980 : : final gcond. */
981 : :
982 : : static gcond *
983 : 7 : vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
984 : : edge exit_edge,
985 : : loop_vec_info loop_vinfo, tree niters,
986 : : tree final_iv,
987 : : bool niters_maybe_zero,
988 : : gimple_stmt_iterator loop_cond_gsi)
989 : : {
990 : 7 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
991 : 7 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
992 : 7 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
993 : 7 : tree orig_niters = niters;
994 : 7 : gimple_seq preheader_seq = NULL;
995 : :
996 : : /* Create an IV that counts down from niters and whose step
997 : : is the number of iterations processed in the current iteration.
998 : : Produce the controls with compares like the following.
999 : :
1000 : : # iv_2 = PHI <niters, iv_3>
1001 : : rem_4 = MIN <iv_2, VF>;
1002 : : remv_6 = { rem_4, rem_4, rem_4, ... }
1003 : : mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv6;
1004 : : iv_3 = iv_2 - VF;
1005 : : if (iv_2 > VF)
1006 : : continue;
1007 : :
1008 : : Where the constant is built with elements at most VF - 1 and
1009 : : repetitions according to max_nscalars_per_iter which is guarnateed
1010 : : to be the same within a group. */
1011 : :
1012 : : /* Convert NITERS to the determined IV type. */
1013 : 7 : if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
1014 : 7 : && niters_maybe_zero)
1015 : : {
1016 : : /* We know that there is always at least one iteration, so if the
1017 : : count is zero then it must have wrapped. Cope with this by
1018 : : subtracting 1 before the conversion and adding 1 to the result. */
1019 : 0 : gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
1020 : 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters),
1021 : 0 : niters, build_minus_one_cst (TREE_TYPE (niters)));
1022 : 0 : niters = gimple_convert (&preheader_seq, iv_type, niters);
1023 : 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type,
1024 : : niters, build_one_cst (iv_type));
1025 : : }
1026 : : else
1027 : 7 : niters = gimple_convert (&preheader_seq, iv_type, niters);
1028 : :
1029 : : /* Bias the initial value of the IV in case we need to skip iterations
1030 : : at the beginning. */
1031 : 7 : tree niters_adj = niters;
1032 : 7 : if (niters_skip)
1033 : : {
1034 : 1 : tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip);
1035 : 1 : niters_adj = gimple_build (&preheader_seq, PLUS_EXPR,
1036 : : iv_type, niters, skip);
1037 : : }
1038 : :
1039 : : /* The iteration step is the vectorization factor. */
1040 : 7 : tree iv_step = build_int_cst (iv_type, vf);
1041 : :
1042 : : /* Create the decrement IV. */
1043 : 7 : tree index_before_incr, index_after_incr;
1044 : 7 : gimple_stmt_iterator incr_gsi;
1045 : 7 : bool insert_after;
1046 : 7 : vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1047 : 7 : create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
1048 : : &incr_gsi, insert_after, &index_before_incr,
1049 : : &index_after_incr);
1050 : :
1051 : : /* Iterate over all the rgroups and fill in their controls. */
1052 : 29 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1053 : : {
1054 : 8 : if (rgc.controls.is_empty ())
1055 : 1 : continue;
1056 : :
1057 : 7 : tree ctrl_type = rgc.type;
1058 : 7 : poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
1059 : :
1060 : 7 : tree vectype = rgc.compare_type;
1061 : :
1062 : : /* index_after_incr is the IV specifying the remaining iterations in
1063 : : the next iteration. */
1064 : 7 : tree rem = index_after_incr;
1065 : : /* When the data type for the compare to produce the mask is
1066 : : smaller than the IV type we need to saturate. Saturate to
1067 : : the smallest possible value (IV_TYPE) so we only have to
1068 : : saturate once (CSE will catch redundant ones we add). */
1069 : 7 : if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
1070 : 2 : rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1071 : : UNKNOWN_LOCATION,
1072 : 2 : MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
1073 : 7 : rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
1074 : 7 : UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
1075 : :
1076 : : /* Build a data vector composed of the remaining iterations. */
1077 : 7 : rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
1078 : : UNKNOWN_LOCATION, vectype, rem);
1079 : :
1080 : : /* Provide a definition of each vector in the control group. */
1081 : 7 : tree next_ctrl = NULL_TREE;
1082 : 7 : tree first_rem = NULL_TREE;
1083 : 7 : tree ctrl;
1084 : 7 : unsigned int i;
1085 : 38 : FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
1086 : : {
1087 : : /* Previous controls will cover BIAS items. This control covers the
1088 : : next batch. */
1089 : 8 : poly_uint64 bias = nitems_per_ctrl * i;
1090 : :
1091 : : /* Build the constant to compare the remaining iters against,
1092 : : this is sth like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately
1093 : : split into pieces. */
1094 : 8 : unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant ();
1095 : 8 : tree_vector_builder builder (vectype, n, 1);
1096 : 188 : for (unsigned i = 0; i < n; ++i)
1097 : : {
1098 : 180 : unsigned HOST_WIDE_INT val
1099 : 180 : = (i + bias.to_constant ()) / rgc.max_nscalars_per_iter;
1100 : 180 : gcc_assert (val < vf.to_constant ());
1101 : 180 : builder.quick_push (build_int_cst (TREE_TYPE (vectype), val));
1102 : : }
1103 : 8 : tree cmp_series = builder.build ();
1104 : :
1105 : : /* Create the initial control. First include all items that
1106 : : are within the loop limit. */
1107 : 8 : tree init_ctrl = NULL_TREE;
1108 : 8 : poly_uint64 const_limit;
1109 : : /* See whether the first iteration of the vector loop is known
1110 : : to have a full control. */
1111 : 8 : if (poly_int_tree_p (niters, &const_limit)
1112 : 8 : && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
1113 : 1 : init_ctrl = build_minus_one_cst (ctrl_type);
1114 : : else
1115 : : {
1116 : : /* The remaining work items initially are niters. Saturate,
1117 : : splat and compare. */
1118 : 7 : if (!first_rem)
1119 : : {
1120 : 6 : first_rem = niters;
1121 : 6 : if (TYPE_PRECISION (TREE_TYPE (vectype))
1122 : 6 : < TYPE_PRECISION (iv_type))
1123 : 2 : first_rem = gimple_build (&preheader_seq,
1124 : 2 : MIN_EXPR, TREE_TYPE (first_rem),
1125 : : first_rem, iv_step);
1126 : 6 : first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype),
1127 : : first_rem);
1128 : 6 : first_rem = gimple_build_vector_from_val (&preheader_seq,
1129 : : vectype, first_rem);
1130 : : }
1131 : 7 : init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type,
1132 : : cmp_series, first_rem);
1133 : : }
1134 : :
1135 : : /* Now AND out the bits that are within the number of skipped
1136 : : items. */
1137 : 8 : poly_uint64 const_skip;
1138 : 8 : if (niters_skip
1139 : 8 : && !(poly_int_tree_p (niters_skip, &const_skip)
1140 : 1 : && known_le (const_skip, bias)))
1141 : : {
1142 : : /* For integer mode masks it's cheaper to shift out the bits
1143 : : since that avoids loading a constant. */
1144 : 1 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
1145 : 1 : init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1146 : 1 : lang_hooks.types.type_for_mode
1147 : 1 : (TYPE_MODE (ctrl_type), 1),
1148 : : init_ctrl);
1149 : : /* ??? But when the shift amount isn't constant this requires
1150 : : a round-trip to GRPs. We could apply the bias to either
1151 : : side of the compare instead. */
1152 : 2 : tree shift = gimple_build (&preheader_seq, MINUS_EXPR,
1153 : 1 : TREE_TYPE (niters_skip), niters_skip,
1154 : 1 : build_int_cst (TREE_TYPE (niters_skip),
1155 : : bias));
1156 : 2 : shift = gimple_build (&preheader_seq, MULT_EXPR,
1157 : 1 : TREE_TYPE (niters_skip), shift,
1158 : 1 : build_int_cst (TREE_TYPE (niters_skip),
1159 : 1 : rgc.max_nscalars_per_iter));
1160 : 1 : init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR,
1161 : 1 : TREE_TYPE (init_ctrl),
1162 : : init_ctrl, shift);
1163 : 1 : init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1164 : : ctrl_type, init_ctrl);
1165 : : }
1166 : :
1167 : : /* Get the control value for the next iteration of the loop. */
1168 : 8 : next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1169 : : UNKNOWN_LOCATION,
1170 : : LT_EXPR, ctrl_type, cmp_series, rem);
1171 : :
1172 : 8 : vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
1173 : 8 : }
1174 : : }
1175 : :
1176 : : /* Emit all accumulated statements. */
1177 : 7 : add_preheader_seq (loop, preheader_seq);
1178 : :
1179 : : /* Adjust the exit test using the decrementing IV. */
1180 : 7 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
1181 : : /* When we peel for alignment with niter_skip != 0 this can
1182 : : cause niter + niter_skip to wrap and since we are comparing the
1183 : : value before the decrement here we get a false early exit.
1184 : : We can't compare the value after decrement either because that
1185 : : decrement could wrap as well as we're not doing a saturating
1186 : : decrement. To avoid this situation we force a larger
1187 : : iv_type. */
1188 : 7 : gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
1189 : : NULL_TREE, NULL_TREE);
1190 : 7 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1191 : :
1192 : : /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
1193 : : Subtract one from this to get the latch count. */
1194 : 7 : tree niters_minus_one
1195 : 7 : = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
1196 : : build_minus_one_cst (TREE_TYPE (orig_niters)));
1197 : 7 : tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
1198 : 7 : if (niters_skip)
1199 : 1 : niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one,
1200 : : fold_convert (iv_type, niters_skip));
1201 : 7 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
1202 : : niters_adj2, iv_step);
1203 : :
1204 : 7 : if (final_iv)
1205 : : {
1206 : 0 : gassign *assign;
1207 : : /* If vectorizing an inverted early break loop we have to restart the
1208 : : scalar loop at niters - vf. This matches what we do in
1209 : : vect_gen_vector_loop_niters_mult_vf for non-masked loops. */
1210 : 0 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
1211 : : {
1212 : 0 : tree ftype = TREE_TYPE (orig_niters);
1213 : 0 : tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1214 : 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
1215 : : }
1216 : : else
1217 : 0 : assign = gimple_build_assign (final_iv, orig_niters);
1218 : 0 : gsi_insert_on_edge_immediate (exit_edge, assign);
1219 : : }
1220 : :
1221 : 7 : return cond_stmt;
1222 : : }
1223 : :
1224 : :
1225 : : /* Like vect_set_loop_condition, but handle the case in which the vector
1226 : : loop handles exactly VF scalars per iteration. */
1227 : :
1228 : : static gcond *
1229 : 55418 : vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
1230 : : class loop *loop, tree niters, tree step,
1231 : : tree final_iv, bool niters_maybe_zero,
1232 : : gimple_stmt_iterator loop_cond_gsi)
1233 : : {
1234 : 55418 : tree indx_before_incr, indx_after_incr;
1235 : 55418 : gcond *cond_stmt;
1236 : 55418 : gcond *orig_cond;
1237 : 55418 : edge pe = loop_preheader_edge (loop);
1238 : 55418 : gimple_stmt_iterator incr_gsi;
1239 : 55418 : bool insert_after;
1240 : 55418 : enum tree_code code;
1241 : 55418 : tree niters_type = TREE_TYPE (niters);
1242 : :
1243 : 55418 : orig_cond = get_loop_exit_condition (exit_edge);
1244 : 55418 : gcc_assert (orig_cond);
1245 : 55418 : loop_cond_gsi = gsi_for_stmt (orig_cond);
1246 : :
1247 : 55418 : tree init, limit;
1248 : 55418 : if (!niters_maybe_zero && integer_onep (step))
1249 : : {
1250 : : /* In this case we can use a simple 0-based IV:
1251 : :
1252 : : A:
1253 : : x = 0;
1254 : : do
1255 : : {
1256 : : ...
1257 : : x += 1;
1258 : : }
1259 : : while (x < NITERS); */
1260 : 55418 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1261 : 55418 : init = build_zero_cst (niters_type);
1262 : 55418 : limit = niters;
1263 : : }
1264 : : else
1265 : : {
1266 : : /* The following works for all values of NITERS except 0:
1267 : :
1268 : : B:
1269 : : x = 0;
1270 : : do
1271 : : {
1272 : : ...
1273 : : x += STEP;
1274 : : }
1275 : : while (x <= NITERS - STEP);
1276 : :
1277 : : so that the loop continues to iterate if x + STEP - 1 < NITERS
1278 : : but stops if x + STEP - 1 >= NITERS.
1279 : :
1280 : : However, if NITERS is zero, x never hits a value above NITERS - STEP
1281 : : before wrapping around. There are two obvious ways of dealing with
1282 : : this:
1283 : :
1284 : : - start at STEP - 1 and compare x before incrementing it
1285 : : - start at -1 and compare x after incrementing it
1286 : :
1287 : : The latter is simpler and is what we use. The loop in this case
1288 : : looks like:
1289 : :
1290 : : C:
1291 : : x = -1;
1292 : : do
1293 : : {
1294 : : ...
1295 : : x += STEP;
1296 : : }
1297 : : while (x < NITERS - STEP);
1298 : :
1299 : : In both cases the loop limit is NITERS - STEP. */
1300 : 0 : gimple_seq seq = NULL;
1301 : 0 : limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1302 : 0 : limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
1303 : 0 : if (seq)
1304 : : {
1305 : 0 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1306 : 0 : gcc_assert (!new_bb);
1307 : : }
1308 : 0 : if (niters_maybe_zero)
1309 : : {
1310 : : /* Case C. */
1311 : 0 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1312 : 0 : init = build_all_ones_cst (niters_type);
1313 : : }
1314 : : else
1315 : : {
1316 : : /* Case B. */
1317 : 0 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1318 : 0 : init = build_zero_cst (niters_type);
1319 : : }
1320 : : }
1321 : :
1322 : 55418 : vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1323 : 55418 : create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1324 : : &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1325 : 55418 : indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1326 : : true, NULL_TREE, true,
1327 : : GSI_SAME_STMT);
1328 : 55418 : limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1329 : : true, GSI_SAME_STMT);
1330 : :
1331 : 55418 : cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1332 : : NULL_TREE);
1333 : :
1334 : 55418 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1335 : :
1336 : : /* Record the number of latch iterations. */
1337 : 55418 : if (limit == niters)
1338 : : /* Case A: the loop iterates NITERS times. Subtract one to get the
1339 : : latch count. */
1340 : 55418 : loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1341 : : build_int_cst (niters_type, 1));
1342 : : else
1343 : : /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1344 : : Subtract one from this to get the latch count. */
1345 : 0 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1346 : : limit, step);
1347 : :
1348 : 55418 : if (final_iv)
1349 : : {
1350 : 0 : gassign *assign;
1351 : 0 : gcc_assert (single_pred_p (exit_edge->dest));
1352 : 0 : tree phi_dest
1353 : 0 : = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
1354 : : /* Make sure to maintain LC SSA form here and elide the subtraction
1355 : : if the value is zero. */
1356 : 0 : gphi *phi = create_phi_node (phi_dest, exit_edge->dest);
1357 : 0 : add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION);
1358 : 0 : if (!integer_zerop (init))
1359 : : {
1360 : 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR,
1361 : : phi_dest, init);
1362 : 0 : gimple_stmt_iterator gsi = gsi_after_labels (exit_edge->dest);
1363 : 0 : gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1364 : : }
1365 : : }
1366 : :
1367 : 55418 : return cond_stmt;
1368 : : }
1369 : :
1370 : : /* If we're using fully-masked loops, make LOOP iterate:
1371 : :
1372 : : N == (NITERS - 1) / STEP + 1
1373 : :
1374 : : times. When NITERS is zero, this is equivalent to making the loop
1375 : : execute (1 << M) / STEP times, where M is the precision of NITERS.
1376 : : NITERS_MAYBE_ZERO is true if this last case might occur.
1377 : :
1378 : : If we're not using fully-masked loops, make LOOP iterate:
1379 : :
1380 : : N == (NITERS - STEP) / STEP + 1
1381 : :
1382 : : times, where NITERS is known to be outside the range [1, STEP - 1].
1383 : : This is equivalent to making the loop execute NITERS / STEP times
1384 : : when NITERS is nonzero and (1 << M) / STEP times otherwise.
1385 : : NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1386 : :
1387 : : If FINAL_IV is nonnull, it is an SSA name that should be set to
1388 : : N * STEP on exit from the loop.
1389 : :
1390 : : Assumption: the exit-condition of LOOP is the last stmt in the loop. */
1391 : :
1392 : : void
1393 : 55425 : vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo,
1394 : : tree niters, tree step, tree final_iv,
1395 : : bool niters_maybe_zero)
1396 : : {
1397 : 55425 : gcond *cond_stmt;
1398 : 55425 : gcond *orig_cond = get_loop_exit_condition (loop_e);
1399 : 55425 : gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1400 : :
1401 : 55425 : if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1402 : : {
1403 : 7 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1404 : 7 : cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_e,
1405 : : loop_vinfo,
1406 : : niters, final_iv,
1407 : : niters_maybe_zero,
1408 : : loop_cond_gsi);
1409 : : else
1410 : 0 : cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_e,
1411 : : loop_vinfo,
1412 : : niters, final_iv,
1413 : : niters_maybe_zero,
1414 : : loop_cond_gsi);
1415 : : }
1416 : : else
1417 : 55418 : cond_stmt = vect_set_loop_condition_normal (loop_vinfo, loop_e, loop,
1418 : : niters,
1419 : : step, final_iv,
1420 : : niters_maybe_zero,
1421 : : loop_cond_gsi);
1422 : :
1423 : : /* Remove old loop exit test. */
1424 : 55425 : stmt_vec_info orig_cond_info;
1425 : 55425 : if (loop_vinfo
1426 : 55425 : && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1427 : 55100 : loop_vinfo->remove_stmt (orig_cond_info);
1428 : : else
1429 : 325 : gsi_remove (&loop_cond_gsi, true);
1430 : :
1431 : 55425 : if (dump_enabled_p ())
1432 : 10074 : dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1433 : : (gimple *) cond_stmt);
1434 : 55425 : }
1435 : :
1436 : : /* Get the virtual operand live on E. The precondition on this is valid
1437 : : immediate dominators and an actual virtual definition dominating E. */
1438 : : /* ??? Costly band-aid. For the use in question we can populate a
1439 : : live-on-exit/end-of-BB virtual operand when copying stmts. */
1440 : :
1441 : : static tree
1442 : 10 : get_live_virtual_operand_on_edge (edge e)
1443 : : {
1444 : 10 : basic_block bb = e->src;
1445 : 22 : do
1446 : : {
1447 : 61 : for (auto gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1448 : : {
1449 : 37 : gimple *stmt = gsi_stmt (gsi);
1450 : 55 : if (gimple_vdef (stmt))
1451 : 10 : return gimple_vdef (stmt);
1452 : 39 : if (gimple_vuse (stmt))
1453 : : return gimple_vuse (stmt);
1454 : : }
1455 : 8 : if (gphi *vphi = get_virtual_phi (bb))
1456 : 2 : return gimple_phi_result (vphi);
1457 : 6 : bb = get_immediate_dominator (CDI_DOMINATORS, bb);
1458 : 6 : }
1459 : : while (1);
1460 : : }
1461 : :
1462 : : /* Given LOOP this function generates a new copy of it and puts it
1463 : : on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1464 : : non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1465 : : basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1466 : : entry or exit of LOOP. If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
1467 : : continuation. This is correct for cases where one loop continues from the
1468 : : other like in the vectorizer, but not true for uses in e.g. loop distribution
1469 : : where the contents of the loop body are split but the iteration space of both
1470 : : copies remains the same.
1471 : :
1472 : : If UPDATED_DOMS is not NULL it is update with the list of basic blocks whoms
1473 : : dominators were updated during the peeling. When doing early break vectorization
1474 : : then LOOP_VINFO needs to be provided and is used to keep track of any newly created
1475 : : memory references that need to be updated should we decide to vectorize. */
1476 : :
1477 : : class loop *
1478 : 31664 : slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
1479 : : class loop *scalar_loop,
1480 : : edge scalar_exit, edge e, edge *new_e,
1481 : : bool flow_loops,
1482 : : vec<basic_block> *updated_doms)
1483 : : {
1484 : 31664 : class loop *new_loop;
1485 : 31664 : basic_block *new_bbs, *bbs, *pbbs;
1486 : 31664 : bool at_exit;
1487 : 31664 : bool was_imm_dom;
1488 : 31664 : basic_block exit_dest;
1489 : 31664 : edge exit, new_exit;
1490 : 31664 : bool duplicate_outer_loop = false;
1491 : :
1492 : 31664 : exit = loop_exit;
1493 : 31664 : at_exit = (e == exit);
1494 : 31664 : if (!at_exit && e != loop_preheader_edge (loop))
1495 : : return NULL;
1496 : :
1497 : 31664 : if (scalar_loop == NULL)
1498 : : {
1499 : 29880 : scalar_loop = loop;
1500 : 29880 : scalar_exit = loop_exit;
1501 : : }
1502 : 1784 : else if (scalar_loop == loop)
1503 : 0 : scalar_exit = loop_exit;
1504 : : else
1505 : : {
1506 : : /* Loop has been version, match exits up using the aux index. */
1507 : 5352 : for (edge exit : get_loop_exit_edges (scalar_loop))
1508 : 1784 : if (exit->aux == loop_exit->aux)
1509 : : {
1510 : 1784 : scalar_exit = exit;
1511 : 1784 : break;
1512 : 1784 : }
1513 : :
1514 : 1784 : gcc_assert (scalar_exit);
1515 : : }
1516 : :
1517 : 31664 : bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1518 : 31664 : pbbs = bbs + 1;
1519 : 31664 : get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1520 : : /* Allow duplication of outer loops. */
1521 : 31664 : if (scalar_loop->inner)
1522 : 101 : duplicate_outer_loop = true;
1523 : :
1524 : : /* Generate new loop structure. */
1525 : 31664 : new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1526 : 31664 : duplicate_subloops (scalar_loop, new_loop);
1527 : :
1528 : 31664 : exit_dest = exit->dest;
1529 : 31664 : was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1530 : 31664 : exit_dest) == exit->src ?
1531 : : true : false);
1532 : :
1533 : : /* Also copy the pre-header, this avoids jumping through hoops to
1534 : : duplicate the loop entry PHI arguments. Create an empty
1535 : : pre-header unconditionally for this. */
1536 : 31664 : basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1537 : 31664 : edge entry_e = single_pred_edge (preheader);
1538 : 31664 : bbs[0] = preheader;
1539 : 31664 : new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1540 : :
1541 : 31664 : copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1542 : : &scalar_exit, 1, &new_exit, NULL,
1543 : : at_exit ? loop->latch : e->src, true);
1544 : 31664 : exit = loop_exit;
1545 : 31664 : basic_block new_preheader = new_bbs[0];
1546 : :
1547 : 31664 : gcc_assert (new_exit);
1548 : :
1549 : : /* Record the new loop exit information. new_loop doesn't have SCEV data and
1550 : : so we must initialize the exit information. */
1551 : 31664 : if (new_e)
1552 : 30661 : *new_e = new_exit;
1553 : :
1554 : : /* Before installing PHI arguments make sure that the edges
1555 : : into them match that of the scalar loop we analyzed. This
1556 : : makes sure the SLP tree matches up between the main vectorized
1557 : : loop and the epilogue vectorized copies. */
1558 : 31664 : if (single_succ_edge (preheader)->dest_idx
1559 : 31664 : != single_succ_edge (new_bbs[0])->dest_idx)
1560 : : {
1561 : 27469 : basic_block swap_bb = new_bbs[1];
1562 : 27469 : gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1563 : 27469 : std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1564 : 27469 : EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1565 : 27469 : EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1566 : : }
1567 : 31664 : if (duplicate_outer_loop)
1568 : : {
1569 : 101 : class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1570 : 101 : if (loop_preheader_edge (scalar_loop)->dest_idx
1571 : 101 : != loop_preheader_edge (new_inner_loop)->dest_idx)
1572 : : {
1573 : 82 : basic_block swap_bb = new_inner_loop->header;
1574 : 82 : gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1575 : 82 : std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1576 : 82 : EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1577 : 82 : EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1578 : : }
1579 : : }
1580 : :
1581 : 31664 : add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1582 : :
1583 : : /* Skip new preheader since it's deleted if copy loop is added at entry. */
1584 : 164200 : for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1585 : 100872 : rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1586 : :
1587 : : /* Rename the exit uses. */
1588 : 128316 : for (edge exit : get_loop_exit_edges (new_loop))
1589 : 33324 : for (auto gsi = gsi_start_phis (exit->dest);
1590 : 77978 : !gsi_end_p (gsi); gsi_next (&gsi))
1591 : : {
1592 : 44654 : tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
1593 : 44654 : rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
1594 : 44654 : if (MAY_HAVE_DEBUG_BIND_STMTS)
1595 : 21936 : adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
1596 : 31664 : }
1597 : :
1598 : 31664 : auto loop_exits = get_loop_exit_edges (loop);
1599 : 31664 : bool multiple_exits_p = loop_exits.length () > 1;
1600 : 31664 : auto_vec<basic_block> doms;
1601 : :
1602 : 31664 : if (at_exit) /* Add the loop copy at exit. */
1603 : : {
1604 : 30336 : if (scalar_loop != loop && new_exit->dest != exit_dest)
1605 : : {
1606 : 1780 : new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1607 : 1780 : flush_pending_stmts (new_exit);
1608 : : }
1609 : :
1610 : 30336 : bool need_virtual_phi = get_virtual_phi (loop->header);
1611 : :
1612 : : /* For the main loop exit preserve the LC PHI nodes. For vectorization
1613 : : we need them to continue or finalize reductions. Since we do not
1614 : : copy the loop exit blocks we have to materialize PHIs at the
1615 : : new destination before redirecting edges. */
1616 : 30336 : for (auto gsi_from = gsi_start_phis (loop_exit->dest);
1617 : 72379 : !gsi_end_p (gsi_from); gsi_next (&gsi_from))
1618 : : {
1619 : 42043 : tree res = gimple_phi_result (*gsi_from);
1620 : 42043 : create_phi_node (copy_ssa_name (res), new_preheader);
1621 : : }
1622 : 30336 : edge e = redirect_edge_and_branch (loop_exit, new_preheader);
1623 : 30336 : gcc_assert (e == loop_exit);
1624 : 30336 : flush_pending_stmts (loop_exit);
1625 : 30336 : set_immediate_dominator (CDI_DOMINATORS, new_preheader, loop_exit->src);
1626 : :
1627 : : /* If we ended up choosing an exit leading to a path not using memory
1628 : : we can end up without a virtual LC PHI. Create it when it is
1629 : : needed because of the epilog loop continuation. */
1630 : 30336 : if (need_virtual_phi && !get_virtual_phi (loop_exit->dest))
1631 : : {
1632 : 8 : tree header_def = gimple_phi_result (get_virtual_phi (loop->header));
1633 : 8 : gphi *vphi = create_phi_node (copy_ssa_name (header_def),
1634 : : new_preheader);
1635 : 8 : add_phi_arg (vphi, get_live_virtual_operand_on_edge (loop_exit),
1636 : : loop_exit, UNKNOWN_LOCATION);
1637 : : }
1638 : :
1639 : 30336 : bool multiple_exits_p = loop_exits.length () > 1;
1640 : 60672 : basic_block main_loop_exit_block = new_preheader;
1641 : 30336 : basic_block alt_loop_exit_block = NULL;
1642 : : /* Create the CFG for multiple exits.
1643 : : | loop_exit | alt1 | altN
1644 : : v v ... v
1645 : : main_loop_exit_block: alt_loop_exit_block:
1646 : : | /
1647 : : v v
1648 : : new_preheader:
1649 : : where in the new preheader we need merge PHIs for
1650 : : the continuation values into the epilogue header.
1651 : : Do not bother with exit PHIs for the early exits but
1652 : : their live virtual operand. We'll fix up things below. */
1653 : 30336 : if (multiple_exits_p)
1654 : : {
1655 : 1340 : edge loop_e = single_succ_edge (new_preheader);
1656 : 1340 : new_preheader = split_edge (loop_e);
1657 : :
1658 : 1340 : gphi *vphi = NULL;
1659 : 1340 : alt_loop_exit_block = new_preheader;
1660 : 6888 : for (auto exit : loop_exits)
1661 : 2868 : if (exit != loop_exit)
1662 : : {
1663 : 1528 : tree vphi_def = NULL_TREE;
1664 : 1528 : if (gphi *evphi = get_virtual_phi (exit->dest))
1665 : 605 : vphi_def = gimple_phi_arg_def_from_edge (evphi, exit);
1666 : 1528 : edge res = redirect_edge_and_branch (exit, alt_loop_exit_block);
1667 : 1528 : gcc_assert (res == exit);
1668 : 1528 : redirect_edge_var_map_clear (exit);
1669 : 1528 : if (alt_loop_exit_block == new_preheader)
1670 : 1340 : alt_loop_exit_block = split_edge (exit);
1671 : 1528 : if (!need_virtual_phi)
1672 : 1098 : continue;
1673 : : /* When the edge has no virtual LC PHI get at the live
1674 : : virtual operand by other means. */
1675 : 430 : if (!vphi_def)
1676 : 2 : vphi_def = get_live_virtual_operand_on_edge (exit);
1677 : 430 : if (!vphi)
1678 : 408 : vphi = create_phi_node (copy_ssa_name (vphi_def),
1679 : : alt_loop_exit_block);
1680 : : else
1681 : : /* Edge redirection might re-allocate the PHI node
1682 : : so we have to rediscover it. */
1683 : 22 : vphi = get_virtual_phi (alt_loop_exit_block);
1684 : 430 : add_phi_arg (vphi, vphi_def, exit, UNKNOWN_LOCATION);
1685 : : }
1686 : :
1687 : 1340 : set_immediate_dominator (CDI_DOMINATORS, new_preheader,
1688 : : loop->header);
1689 : :
1690 : : /* Fix up the profile counts of the new exit blocks.
1691 : : main_loop_exit_block was created by duplicating the
1692 : : preheader, so needs its count scaling according to the main
1693 : : exit edge's probability. The remaining count from the
1694 : : preheader goes to the alt_loop_exit_block, since all
1695 : : alternative exits have been redirected there. */
1696 : 1340 : main_loop_exit_block->count = loop_exit->count ();
1697 : 1340 : alt_loop_exit_block->count
1698 : 1340 : = preheader->count - main_loop_exit_block->count;
1699 : : }
1700 : :
1701 : : /* Adjust the epilog loop PHI entry values to continue iteration.
1702 : : This adds remaining necessary LC PHI nodes to the main exit
1703 : : and creates merge PHIs when we have multiple exits with
1704 : : their appropriate continuation. */
1705 : 30336 : if (flow_loops)
1706 : : {
1707 : 30336 : edge loop_entry = single_succ_edge (new_preheader);
1708 : 30336 : bool peeled_iters = single_pred (loop->latch) != loop_exit->src;
1709 : :
1710 : : /* Record the new SSA names in the cache so that we can skip
1711 : : materializing them again when we fill in the rest of the LC SSA
1712 : : variables. */
1713 : 30336 : hash_map <tree, tree> new_phi_args;
1714 : 30336 : for (auto psi = gsi_start_phis (main_loop_exit_block);
1715 : 72387 : !gsi_end_p (psi); gsi_next (&psi))
1716 : : {
1717 : 42051 : gphi *phi = *psi;
1718 : 42051 : tree new_arg = gimple_phi_arg_def_from_edge (phi, loop_exit);
1719 : 42051 : if (TREE_CODE (new_arg) != SSA_NAME)
1720 : 198 : continue;
1721 : :
1722 : : /* If the loop doesn't have a virtual def then only possibly keep
1723 : : the epilog LC PHI for it and avoid creating new defs. */
1724 : 41885 : if (virtual_operand_p (new_arg) && !need_virtual_phi)
1725 : : {
1726 : 32 : auto gsi = gsi_for_stmt (phi);
1727 : 32 : remove_phi_node (&gsi, true);
1728 : 32 : continue;
1729 : 32 : }
1730 : :
1731 : : /* If we decided not to remove the PHI node we should also not
1732 : : rematerialize it later on. */
1733 : 41853 : new_phi_args.put (new_arg, gimple_phi_result (phi));
1734 : : }
1735 : :
1736 : : /* Create the merge PHI nodes in new_preheader and populate the
1737 : : arguments for the exits. */
1738 : 30336 : if (multiple_exits_p)
1739 : : {
1740 : 1340 : for (auto gsi_from = gsi_start_phis (loop->header),
1741 : 1340 : gsi_to = gsi_start_phis (new_loop->header);
1742 : 4301 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1743 : 2961 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1744 : : {
1745 : 2961 : gimple *from_phi = gsi_stmt (gsi_from);
1746 : 2961 : gimple *to_phi = gsi_stmt (gsi_to);
1747 : :
1748 : : /* When the vector loop is peeled then we need to use the
1749 : : value at start of the loop, otherwise the main loop exit
1750 : : should use the final iter value. */
1751 : 2961 : tree new_arg;
1752 : 2961 : if (peeled_iters)
1753 : 53 : new_arg = gimple_phi_result (from_phi);
1754 : : else
1755 : 2908 : new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1756 : : loop_latch_edge (loop));
1757 : :
1758 : : /* Check if we've already created a new phi node during edge
1759 : : redirection and re-use it if so. Otherwise create a
1760 : : LC PHI node to feed the merge PHI. */
1761 : 2961 : tree *res;
1762 : 5922 : if (virtual_operand_p (new_arg))
1763 : : {
1764 : : /* Use the existing virtual LC SSA from exit block. */
1765 : 408 : gphi *vphi = get_virtual_phi (main_loop_exit_block);
1766 : 408 : new_arg = gimple_phi_result (vphi);
1767 : : }
1768 : 2553 : else if ((res = new_phi_args.get (new_arg)))
1769 : 97 : new_arg = *res;
1770 : : else
1771 : : {
1772 : : /* Create the LC PHI node for the exit. */
1773 : 2456 : tree new_def = copy_ssa_name (new_arg);
1774 : 2456 : gphi *lc_phi
1775 : 2456 : = create_phi_node (new_def, main_loop_exit_block);
1776 : 2456 : SET_PHI_ARG_DEF (lc_phi, 0, new_arg);
1777 : 2456 : new_arg = new_def;
1778 : : }
1779 : :
1780 : : /* Create the PHI node in the merge block merging the
1781 : : main and early exit values. */
1782 : 2961 : tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1783 : 2961 : gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1784 : 2961 : edge main_e = single_succ_edge (main_loop_exit_block);
1785 : 2961 : SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, main_e, new_arg);
1786 : :
1787 : : /* And adjust the epilog entry value. */
1788 : 2961 : adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1789 : : }
1790 : :
1791 : : /* After creating the merge PHIs handle the early exits those
1792 : : should use the values at the start of the loop. */
1793 : 1340 : for (auto gsi_from = gsi_start_phis (loop->header),
1794 : 1340 : gsi_to = gsi_start_phis (new_preheader);
1795 : 4301 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1796 : 2961 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1797 : : {
1798 : 2961 : gimple *from_phi = gsi_stmt (gsi_from);
1799 : 2961 : gimple *to_phi = gsi_stmt (gsi_to);
1800 : :
1801 : : /* Now update the virtual PHI nodes with the right value. */
1802 : 2961 : tree alt_arg = gimple_phi_result (from_phi);
1803 : 5922 : if (virtual_operand_p (alt_arg))
1804 : : {
1805 : 408 : gphi *vphi = get_virtual_phi (alt_loop_exit_block);
1806 : 408 : alt_arg = gimple_phi_result (vphi);
1807 : : }
1808 : : /* For other live args we didn't create LC PHI nodes.
1809 : : Do so here. */
1810 : : else
1811 : : {
1812 : 2553 : tree alt_def = copy_ssa_name (alt_arg);
1813 : 2553 : gphi *lc_phi
1814 : 2553 : = create_phi_node (alt_def, alt_loop_exit_block);
1815 : 5474 : for (unsigned i = 0; i < gimple_phi_num_args (lc_phi);
1816 : : ++i)
1817 : 2921 : SET_PHI_ARG_DEF (lc_phi, i, alt_arg);
1818 : : alt_arg = alt_def;
1819 : : }
1820 : 2961 : edge alt_e = single_succ_edge (alt_loop_exit_block);
1821 : 2961 : SET_PHI_ARG_DEF_ON_EDGE (to_phi, alt_e, alt_arg);
1822 : : }
1823 : : }
1824 : : /* For the single exit case only create the missing LC PHI nodes
1825 : : for the continuation of the loop IVs that are not also already
1826 : : reductions and thus had LC PHI nodes on the exit already. */
1827 : : else
1828 : : {
1829 : 28996 : for (auto gsi_from = gsi_start_phis (loop->header),
1830 : 28996 : gsi_to = gsi_start_phis (new_loop->header);
1831 : 112380 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1832 : 83384 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1833 : : {
1834 : 83384 : gimple *from_phi = gsi_stmt (gsi_from);
1835 : 83384 : gimple *to_phi = gsi_stmt (gsi_to);
1836 : 83384 : tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1837 : : loop_latch_edge (loop));
1838 : :
1839 : : /* Check if we've already created a new phi node during edge
1840 : : redirection. If we have, only propagate the value
1841 : : downwards. */
1842 : 83384 : if (tree *res = new_phi_args.get (new_arg))
1843 : : {
1844 : 39551 : adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
1845 : 39551 : continue;
1846 : : }
1847 : :
1848 : 43833 : tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1849 : 43833 : gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1850 : 43833 : SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, loop_exit, new_arg);
1851 : 43833 : adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1852 : : }
1853 : : }
1854 : 30336 : }
1855 : :
1856 : 30336 : if (was_imm_dom || duplicate_outer_loop)
1857 : 30117 : set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1858 : :
1859 : : /* And remove the non-necessary forwarder again. Keep the other
1860 : : one so we have a proper pre-header for the loop at the exit edge. */
1861 : 30336 : redirect_edge_pred (single_succ_edge (preheader),
1862 : : single_pred (preheader));
1863 : 30336 : delete_basic_block (preheader);
1864 : 30336 : set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1865 : 30336 : loop_preheader_edge (scalar_loop)->src);
1866 : :
1867 : : /* Finally after wiring the new epilogue we need to update its main exit
1868 : : to the original function exit we recorded. Other exits are already
1869 : : correct. */
1870 : 30336 : if (multiple_exits_p)
1871 : : {
1872 : 1340 : class loop *update_loop = new_loop;
1873 : 1340 : doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
1874 : 72187 : for (unsigned i = 0; i < doms.length (); ++i)
1875 : 70847 : if (flow_bb_inside_loop_p (loop, doms[i]))
1876 : 4195 : doms.unordered_remove (i);
1877 : :
1878 : 6888 : for (edge e : get_loop_exit_edges (update_loop))
1879 : : {
1880 : 2868 : edge ex;
1881 : 2868 : edge_iterator ei;
1882 : 5670 : FOR_EACH_EDGE (ex, ei, e->dest->succs)
1883 : : {
1884 : : /* Find the first non-fallthrough block as fall-throughs can't
1885 : : dominate other blocks. */
1886 : 2802 : if (single_succ_p (ex->dest))
1887 : : {
1888 : 1462 : doms.safe_push (ex->dest);
1889 : 1462 : ex = single_succ_edge (ex->dest);
1890 : : }
1891 : 2802 : doms.safe_push (ex->dest);
1892 : : }
1893 : 2868 : doms.safe_push (e->dest);
1894 : 1340 : }
1895 : :
1896 : 1340 : iterate_fix_dominators (CDI_DOMINATORS, doms, false);
1897 : 1340 : if (updated_doms)
1898 : 1340 : updated_doms->safe_splice (doms);
1899 : : }
1900 : : }
1901 : : else /* Add the copy at entry. */
1902 : : {
1903 : : /* Copy the current loop LC PHI nodes between the original loop exit
1904 : : block and the new loop header. This allows us to later split the
1905 : : preheader block and still find the right LC nodes. */
1906 : 1328 : if (flow_loops)
1907 : 325 : for (auto gsi_from = gsi_start_phis (new_loop->header),
1908 : 325 : gsi_to = gsi_start_phis (loop->header);
1909 : 1075 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1910 : 750 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1911 : : {
1912 : 750 : gimple *from_phi = gsi_stmt (gsi_from);
1913 : 750 : gimple *to_phi = gsi_stmt (gsi_to);
1914 : 750 : tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1915 : : loop_latch_edge (new_loop));
1916 : 750 : adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
1917 : : new_arg);
1918 : : }
1919 : :
1920 : 1328 : if (scalar_loop != loop)
1921 : : {
1922 : : /* Remove the non-necessary forwarder of scalar_loop again. */
1923 : 4 : redirect_edge_pred (single_succ_edge (preheader),
1924 : : single_pred (preheader));
1925 : 4 : delete_basic_block (preheader);
1926 : 4 : set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1927 : 4 : loop_preheader_edge (scalar_loop)->src);
1928 : 4 : preheader = split_edge (loop_preheader_edge (loop));
1929 : 4 : entry_e = single_pred_edge (preheader);
1930 : : }
1931 : :
1932 : 1328 : redirect_edge_and_branch_force (entry_e, new_preheader);
1933 : 1328 : flush_pending_stmts (entry_e);
1934 : 1328 : set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1935 : :
1936 : 1328 : redirect_edge_and_branch_force (new_exit, preheader);
1937 : 1328 : flush_pending_stmts (new_exit);
1938 : 1328 : set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1939 : :
1940 : : /* And remove the non-necessary forwarder again. Keep the other
1941 : : one so we have a proper pre-header for the loop at the exit edge. */
1942 : 1328 : redirect_edge_pred (single_succ_edge (new_preheader),
1943 : : single_pred (new_preheader));
1944 : 1328 : delete_basic_block (new_preheader);
1945 : 1328 : set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1946 : 1328 : loop_preheader_edge (new_loop)->src);
1947 : :
1948 : : /* Update dominators for multiple exits. */
1949 : 1328 : if (multiple_exits_p)
1950 : : {
1951 : 652 : for (edge alt_e : loop_exits)
1952 : : {
1953 : 262 : if (alt_e == loop_exit)
1954 : 130 : continue;
1955 : 132 : basic_block old_dom
1956 : 132 : = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
1957 : 132 : if (flow_bb_inside_loop_p (loop, old_dom))
1958 : : {
1959 : 45 : auto_vec<basic_block, 8> queue;
1960 : 45 : for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
1961 : 149 : son; son = next_dom_son (CDI_DOMINATORS, son))
1962 : 104 : if (!flow_bb_inside_loop_p (loop, son))
1963 : 59 : queue.safe_push (son);
1964 : 194 : for (auto son : queue)
1965 : 59 : set_immediate_dominator (CDI_DOMINATORS,
1966 : : son, get_bb_copy (old_dom));
1967 : 45 : }
1968 : : }
1969 : : }
1970 : : }
1971 : :
1972 : 31664 : free (new_bbs);
1973 : 31664 : free (bbs);
1974 : :
1975 : 31664 : checking_verify_dominators (CDI_DOMINATORS);
1976 : :
1977 : 31664 : return new_loop;
1978 : 31664 : }
1979 : :
1980 : :
1981 : : /* Given the condition expression COND, put it as the last statement of
1982 : : GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1983 : : DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1984 : : skip the loop. PROBABILITY is the skip edge's probability. Mark the
1985 : : new edge as irreducible if IRREDUCIBLE_P is true. */
1986 : :
1987 : : static edge
1988 : 45936 : slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1989 : : basic_block guard_to, basic_block dom_bb,
1990 : : profile_probability probability, bool irreducible_p)
1991 : : {
1992 : 45936 : gimple_stmt_iterator gsi;
1993 : 45936 : edge new_e, enter_e;
1994 : 45936 : gcond *cond_stmt;
1995 : 45936 : gimple_seq gimplify_stmt_list = NULL;
1996 : :
1997 : 45936 : enter_e = EDGE_SUCC (guard_bb, 0);
1998 : 45936 : enter_e->flags &= ~EDGE_FALLTHRU;
1999 : 45936 : enter_e->flags |= EDGE_FALSE_VALUE;
2000 : 45936 : gsi = gsi_last_bb (guard_bb);
2001 : :
2002 : 45936 : cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
2003 : : is_gimple_condexpr_for_cond, NULL_TREE);
2004 : 45936 : if (gimplify_stmt_list)
2005 : 20984 : gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
2006 : :
2007 : 45936 : cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
2008 : 45936 : gsi = gsi_last_bb (guard_bb);
2009 : 45936 : gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
2010 : :
2011 : : /* Add new edge to connect guard block to the merge/loop-exit block. */
2012 : 45936 : new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
2013 : :
2014 : 45936 : new_e->probability = probability;
2015 : 45936 : if (irreducible_p)
2016 : 16 : new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
2017 : :
2018 : 45936 : enter_e->probability = probability.invert ();
2019 : 45936 : set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
2020 : :
2021 : : /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
2022 : 45936 : if (enter_e->dest->loop_father->header == enter_e->dest)
2023 : 330 : split_edge (enter_e);
2024 : :
2025 : 45936 : return new_e;
2026 : : }
2027 : :
2028 : :
2029 : : /* This function verifies that the following restrictions apply to LOOP:
2030 : : (1) it consists of exactly 2 basic blocks - header, and an empty latch
2031 : : for innermost loop and 5 basic blocks for outer-loop.
2032 : : (2) it is single entry, single exit
2033 : : (3) its exit condition is the last stmt in the header
2034 : : (4) E is the entry/exit edge of LOOP.
2035 : : */
2036 : :
2037 : : bool
2038 : 385095 : slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e,
2039 : : const_edge e)
2040 : : {
2041 : 385095 : edge entry_e = loop_preheader_edge (loop);
2042 : 385095 : gcond *orig_cond = get_loop_exit_condition (exit_e);
2043 : 385095 : gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
2044 : :
2045 : : /* All loops have an outer scope; the only case loop->outer is NULL is for
2046 : : the function itself. */
2047 : 385095 : if (!loop_outer (loop)
2048 : 385095 : || !empty_block_p (loop->latch)
2049 : : || !exit_e
2050 : : /* Verify that new loop exit condition can be trivially modified. */
2051 : 385095 : || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
2052 : 770190 : || (e != exit_e && e != entry_e))
2053 : 0 : return false;
2054 : :
2055 : 385095 : basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
2056 : 385095 : get_loop_body_with_size (loop, bbs, loop->num_nodes);
2057 : 385095 : bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
2058 : 385095 : free (bbs);
2059 : 385095 : return ret;
2060 : : }
2061 : :
2062 : : /* Function find_loop_location.
2063 : :
2064 : : Extract the location of the loop in the source code.
2065 : : If the loop is not well formed for vectorization, an estimated
2066 : : location is calculated.
2067 : : Return the loop location if succeed and NULL if not. */
2068 : :
2069 : : dump_user_location_t
2070 : 3277255 : find_loop_location (class loop *loop)
2071 : : {
2072 : 3277255 : gimple *stmt = NULL;
2073 : 3277255 : basic_block bb;
2074 : 3277255 : gimple_stmt_iterator si;
2075 : :
2076 : 3277255 : if (!loop)
2077 : 0 : return dump_user_location_t ();
2078 : :
2079 : : /* For the root of the loop tree return the function location. */
2080 : 3277255 : if (!loop_outer (loop))
2081 : 0 : return dump_user_location_t::from_function_decl (cfun->decl);
2082 : :
2083 : 3277255 : if (loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
2084 : : {
2085 : : /* We only care about the loop location, so use any exit with location
2086 : : information. */
2087 : 10336818 : for (edge e : get_loop_exit_edges (loop))
2088 : : {
2089 : 3394450 : stmt = get_loop_exit_condition (e);
2090 : :
2091 : 3394450 : if (stmt
2092 : 3394450 : && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2093 : 2832141 : return stmt;
2094 : 3277255 : }
2095 : : }
2096 : :
2097 : : /* If we got here the loop is probably not "well formed",
2098 : : try to estimate the loop location */
2099 : :
2100 : 445114 : if (!loop->header)
2101 : 0 : return dump_user_location_t ();
2102 : :
2103 : 445114 : bb = loop->header;
2104 : :
2105 : 1592758 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2106 : : {
2107 : 1022656 : stmt = gsi_stmt (si);
2108 : 1022656 : if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2109 : 320126 : return stmt;
2110 : : }
2111 : :
2112 : 124988 : return dump_user_location_t ();
2113 : : }
2114 : :
2115 : : /* Return true if the phi described by STMT_INFO defines an IV of the
2116 : : loop to be vectorized. */
2117 : :
2118 : : static bool
2119 : 1047896 : iv_phi_p (stmt_vec_info stmt_info)
2120 : : {
2121 : 1047896 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
2122 : 2095792 : if (virtual_operand_p (PHI_RESULT (phi)))
2123 : : return false;
2124 : :
2125 : 823880 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2126 : 823880 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2127 : 109404 : return false;
2128 : :
2129 : : return true;
2130 : : }
2131 : :
2132 : : /* Return true if vectorizer can peel for nonlinear iv. */
2133 : : static bool
2134 : 7562 : vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
2135 : : stmt_vec_info stmt_info)
2136 : : {
2137 : 7562 : enum vect_induction_op_type induction_type
2138 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
2139 : 7562 : tree niters_skip;
2140 : : /* Init_expr will be update by vect_update_ivs_after_vectorizer,
2141 : : if niters or vf is unkown:
2142 : : For shift, when shift mount >= precision, there would be UD.
2143 : : For mult, don't known how to generate
2144 : : init_expr * pow (step, niters) for variable niters.
2145 : : For neg unknown niters are ok, since niters of vectorized main loop
2146 : : will always be multiple of 2.
2147 : : See also PR113163, PR114196 and PR114485. */
2148 : 7562 : if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
2149 : 7562 : || LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2150 : 7562 : || (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2151 : 4759 : && induction_type != vect_step_op_neg))
2152 : : {
2153 : 4589 : if (dump_enabled_p ())
2154 : 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2155 : : "Peeling for epilogue is not supported"
2156 : : " for this nonlinear induction"
2157 : : " when iteration count is unknown or"
2158 : : " when using partial vectorization.\n");
2159 : 4589 : return false;
2160 : : }
2161 : :
2162 : : /* Avoid compile time hog on vect_peel_nonlinear_iv_init. */
2163 : 2973 : if (induction_type == vect_step_op_mul)
2164 : : {
2165 : 415 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
2166 : 415 : tree type = TREE_TYPE (step_expr);
2167 : :
2168 : 824 : if (wi::exact_log2 (wi::to_wide (step_expr)) == -1
2169 : 415 : && LOOP_VINFO_INT_NITERS(loop_vinfo) >= TYPE_PRECISION (type))
2170 : : {
2171 : 6 : if (dump_enabled_p ())
2172 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2173 : : "Avoid compile time hog on"
2174 : : " vect_peel_nonlinear_iv_init"
2175 : : " for nonlinear induction vec_step_op_mul"
2176 : : " when iteration count is too big.\n");
2177 : 6 : return false;
2178 : : }
2179 : : }
2180 : :
2181 : : /* Also doens't support peel for neg when niter is variable.
2182 : : ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
2183 : 2967 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
2184 : 2967 : if ((niters_skip != NULL_TREE
2185 : 0 : && (TREE_CODE (niters_skip) != INTEGER_CST
2186 : 0 : || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0))
2187 : 2967 : || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
2188 : 2967 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
2189 : : {
2190 : 4 : if (dump_enabled_p ())
2191 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2192 : : "Peeling for alignement is not supported"
2193 : : " for nonlinear induction when niters_skip"
2194 : : " is not constant.\n");
2195 : 4 : return false;
2196 : : }
2197 : :
2198 : : return true;
2199 : : }
2200 : :
2201 : : /* Function vect_can_advance_ivs_p
2202 : :
2203 : : In case the number of iterations that LOOP iterates is unknown at compile
2204 : : time, an epilog loop will be generated, and the loop induction variables
2205 : : (IVs) will be "advanced" to the value they are supposed to take just before
2206 : : the epilog loop. Here we check that the access function of the loop IVs
2207 : : and the expression that represents the loop bound are simple enough.
2208 : : These restrictions will be relaxed in the future. */
2209 : :
2210 : : bool
2211 : 391718 : vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
2212 : : {
2213 : 391718 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2214 : 391718 : basic_block bb = loop->header;
2215 : 391718 : gphi_iterator gsi;
2216 : :
2217 : : /* Analyze phi functions of the loop header. */
2218 : :
2219 : 391718 : if (dump_enabled_p ())
2220 : 23076 : dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
2221 : 1346374 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2222 : : {
2223 : 961604 : tree evolution_part;
2224 : 961604 : enum vect_induction_op_type induction_type;
2225 : :
2226 : 961604 : gphi *phi = gsi.phi ();
2227 : 961604 : stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2228 : 961604 : if (dump_enabled_p ())
2229 : 66602 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
2230 : : phi_info->stmt);
2231 : :
2232 : : /* Skip virtual phi's. The data dependences that are associated with
2233 : : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
2234 : :
2235 : : Skip reduction phis. */
2236 : 961604 : if (!iv_phi_p (phi_info))
2237 : : {
2238 : 293815 : if (dump_enabled_p ())
2239 : 23494 : dump_printf_loc (MSG_NOTE, vect_location,
2240 : : "reduc or virtual phi. skip.\n");
2241 : 293815 : continue;
2242 : : }
2243 : :
2244 : 667789 : induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2245 : 667789 : if (induction_type != vect_step_op_add)
2246 : : {
2247 : 7562 : if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info))
2248 : : return false;
2249 : :
2250 : 2963 : continue;
2251 : : }
2252 : :
2253 : : /* Analyze the evolution function. */
2254 : :
2255 : 660227 : evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2256 : 660227 : if (evolution_part == NULL_TREE)
2257 : : {
2258 : 2317 : if (dump_enabled_p ())
2259 : 78 : dump_printf (MSG_MISSED_OPTIMIZATION,
2260 : : "No access function or evolution.\n");
2261 : 2317 : return false;
2262 : : }
2263 : :
2264 : : /* FORNOW: We do not transform initial conditions of IVs
2265 : : which evolution functions are not invariants in the loop. */
2266 : :
2267 : 657910 : if (!expr_invariant_in_loop_p (loop, evolution_part))
2268 : : {
2269 : 32 : if (dump_enabled_p ())
2270 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 : : "evolution not invariant in loop.\n");
2272 : 32 : return false;
2273 : : }
2274 : :
2275 : : /* FORNOW: We do not transform initial conditions of IVs
2276 : : which evolution functions are a polynomial of degree >= 2. */
2277 : :
2278 : 1612534 : if (tree_is_chrec (evolution_part))
2279 : : {
2280 : 0 : if (dump_enabled_p ())
2281 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2282 : : "evolution is chrec.\n");
2283 : 0 : return false;
2284 : : }
2285 : : }
2286 : :
2287 : : return true;
2288 : : }
2289 : :
2290 : :
2291 : : /* Function vect_update_ivs_after_vectorizer.
2292 : :
2293 : : "Advance" the induction variables of LOOP to the value they should take
2294 : : after the execution of LOOP. This is currently necessary because the
2295 : : vectorizer does not handle induction variables that are used after the
2296 : : loop. Such a situation occurs when the last iterations of LOOP are
2297 : : peeled, because:
2298 : : 1. We introduced new uses after LOOP for IVs that were not originally used
2299 : : after LOOP: the IVs of LOOP are now used by an epilog loop.
2300 : : 2. LOOP is going to be vectorized; this means that it will iterate N/VF
2301 : : times, whereas the loop IVs should be bumped N times.
2302 : :
2303 : : Input:
2304 : : - LOOP - a loop that is going to be vectorized. The last few iterations
2305 : : of LOOP were peeled.
2306 : : - NITERS - the number of iterations that LOOP executes (before it is
2307 : : vectorized). i.e, the number of times the ivs should be bumped.
2308 : : - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
2309 : : coming out from LOOP on which there are uses of the LOOP ivs
2310 : : (this is the path from LOOP->exit to epilog_loop->preheader).
2311 : :
2312 : : The new definitions of the ivs are placed in LOOP->exit.
2313 : : The phi args associated with the edge UPDATE_E in the bb
2314 : : UPDATE_E->dest are updated accordingly.
2315 : :
2316 : : Assumption 1: Like the rest of the vectorizer, this function assumes
2317 : : a single loop exit that has a single predecessor.
2318 : :
2319 : : Assumption 2: The phi nodes in the LOOP header and in update_bb are
2320 : : organized in the same order.
2321 : :
2322 : : Assumption 3: The access function of the ivs is simple enough (see
2323 : : vect_can_advance_ivs_p). This assumption will be relaxed in the future.
2324 : :
2325 : : Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
2326 : : coming out of LOOP on which the ivs of LOOP are used (this is the path
2327 : : that leads to the epilog loop; other paths skip the epilog loop). This
2328 : : path starts with the edge UPDATE_E, and its destination (denoted update_bb)
2329 : : needs to have its phis updated.
2330 : : */
2331 : :
2332 : : static void
2333 : 30317 : vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
2334 : : tree niters, edge update_e)
2335 : : {
2336 : 30317 : gphi_iterator gsi, gsi1;
2337 : 30317 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2338 : 30317 : basic_block update_bb = update_e->dest;
2339 : 30317 : basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2340 : 30317 : gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
2341 : :
2342 : 30317 : for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
2343 : 116609 : !gsi_end_p (gsi) && !gsi_end_p (gsi1);
2344 : 86292 : gsi_next (&gsi), gsi_next (&gsi1))
2345 : : {
2346 : 86292 : tree init_expr;
2347 : 86292 : tree step_expr, off;
2348 : 86292 : tree type;
2349 : 86292 : tree var, ni, ni_name;
2350 : :
2351 : 86292 : gphi *phi = gsi.phi ();
2352 : 86292 : gphi *phi1 = gsi1.phi ();
2353 : 86292 : stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2354 : 86292 : if (dump_enabled_p ())
2355 : 10327 : dump_printf_loc (MSG_NOTE, vect_location,
2356 : : "vect_update_ivs_after_vectorizer: phi: %G",
2357 : : (gimple *) phi);
2358 : :
2359 : : /* Skip reduction and virtual phis. */
2360 : 86292 : if (!iv_phi_p (phi_info))
2361 : : {
2362 : 39605 : if (dump_enabled_p ())
2363 : 4031 : dump_printf_loc (MSG_NOTE, vect_location,
2364 : : "reduc or virtual phi. skip.\n");
2365 : 39605 : continue;
2366 : : }
2367 : :
2368 : 46687 : type = TREE_TYPE (gimple_phi_result (phi));
2369 : 46687 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2370 : 46687 : step_expr = unshare_expr (step_expr);
2371 : :
2372 : : /* FORNOW: We do not support IVs whose evolution function is a polynomial
2373 : : of degree >= 2 or exponential. */
2374 : 46687 : gcc_assert (!tree_is_chrec (step_expr));
2375 : :
2376 : 46687 : init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2377 : 46687 : gimple_seq stmts = NULL;
2378 : 46687 : enum vect_induction_op_type induction_type
2379 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2380 : :
2381 : 46687 : if (induction_type == vect_step_op_add)
2382 : : {
2383 : 46567 : tree stype = TREE_TYPE (step_expr);
2384 : 46567 : off = fold_build2 (MULT_EXPR, stype,
2385 : : fold_convert (stype, niters), step_expr);
2386 : :
2387 : 46567 : if (POINTER_TYPE_P (type))
2388 : 3179 : ni = fold_build_pointer_plus (init_expr, off);
2389 : : else
2390 : 43388 : ni = fold_convert (type,
2391 : : fold_build2 (PLUS_EXPR, stype,
2392 : : fold_convert (stype, init_expr),
2393 : : off));
2394 : : }
2395 : : /* Don't bother call vect_peel_nonlinear_iv_init. */
2396 : 120 : else if (induction_type == vect_step_op_neg)
2397 : : ni = init_expr;
2398 : : else
2399 : 84 : ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
2400 : : niters, step_expr,
2401 : : induction_type);
2402 : :
2403 : 46687 : var = create_tmp_var (type, "tmp");
2404 : :
2405 : 46687 : gimple_seq new_stmts = NULL;
2406 : 46687 : ni_name = force_gimple_operand (ni, &new_stmts, false, var);
2407 : :
2408 : : /* Exit_bb shouldn't be empty. */
2409 : 46687 : if (!gsi_end_p (last_gsi))
2410 : : {
2411 : 29697 : gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
2412 : 29697 : gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
2413 : : }
2414 : : else
2415 : : {
2416 : 16990 : gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
2417 : 16990 : gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
2418 : : }
2419 : :
2420 : : /* Fix phi expressions in the successor bb. */
2421 : 46687 : adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
2422 : : }
2423 : 30317 : }
2424 : :
2425 : : /* Return a gimple value containing the misalignment (measured in vector
2426 : : elements) for the loop described by LOOP_VINFO, i.e. how many elements
2427 : : it is away from a perfectly aligned address. Add any new statements
2428 : : to SEQ. */
2429 : :
2430 : : static tree
2431 : 117 : get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
2432 : : {
2433 : 117 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2434 : 117 : stmt_vec_info stmt_info = dr_info->stmt;
2435 : 117 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2436 : :
2437 : 117 : poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2438 : 117 : unsigned HOST_WIDE_INT target_align_c;
2439 : 117 : tree target_align_minus_1;
2440 : :
2441 : 117 : bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2442 : 117 : size_zero_node) < 0;
2443 : 117 : tree offset = (negative
2444 : 117 : ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
2445 : : * TREE_INT_CST_LOW
2446 : : (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
2447 : 117 : : size_zero_node);
2448 : 117 : tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
2449 : : stmt_info, seq,
2450 : : offset);
2451 : 117 : tree type = unsigned_type_for (TREE_TYPE (start_addr));
2452 : 117 : if (target_align.is_constant (&target_align_c))
2453 : 117 : target_align_minus_1 = build_int_cst (type, target_align_c - 1);
2454 : : else
2455 : : {
2456 : : tree vla = build_int_cst (type, target_align);
2457 : : tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
2458 : : fold_build2 (MINUS_EXPR, type,
2459 : : build_int_cst (type, 0), vla));
2460 : : target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
2461 : : build_int_cst (type, 1));
2462 : : }
2463 : :
2464 : 117 : HOST_WIDE_INT elem_size
2465 : 117 : = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2466 : 234 : tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
2467 : :
2468 : : /* Create: misalign_in_bytes = addr & (target_align - 1). */
2469 : 117 : tree int_start_addr = fold_convert (type, start_addr);
2470 : 117 : tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
2471 : : target_align_minus_1);
2472 : :
2473 : : /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
2474 : 117 : tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
2475 : : elem_size_log);
2476 : :
2477 : 117 : return misalign_in_elems;
2478 : : }
2479 : :
2480 : : /* Function vect_gen_prolog_loop_niters
2481 : :
2482 : : Generate the number of iterations which should be peeled as prolog for the
2483 : : loop represented by LOOP_VINFO. It is calculated as the misalignment of
2484 : : DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2485 : : As a result, after the execution of this loop, the data reference DR will
2486 : : refer to an aligned location. The following computation is generated:
2487 : :
2488 : : If the misalignment of DR is known at compile time:
2489 : : addr_mis = int mis = DR_MISALIGNMENT (dr);
2490 : : Else, compute address misalignment in bytes:
2491 : : addr_mis = addr & (target_align - 1)
2492 : :
2493 : : prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2494 : :
2495 : : (elem_size = element type size; an element is the scalar element whose type
2496 : : is the inner type of the vectype)
2497 : :
2498 : : The computations will be emitted at the end of BB. We also compute and
2499 : : store upper bound (included) of the result in BOUND.
2500 : :
2501 : : When the step of the data-ref in the loop is not 1 (as in interleaved data
2502 : : and SLP), the number of iterations of the prolog must be divided by the step
2503 : : (which is equal to the size of interleaved group).
2504 : :
2505 : : The above formulas assume that VF == number of elements in the vector. This
2506 : : may not hold when there are multiple-types in the loop.
2507 : : In this case, for some data-references in the loop the VF does not represent
2508 : : the number of elements that fit in the vector. Therefore, instead of VF we
2509 : : use TYPE_VECTOR_SUBPARTS. */
2510 : :
2511 : : static tree
2512 : 325 : vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
2513 : : basic_block bb, int *bound)
2514 : : {
2515 : 325 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2516 : 325 : tree var;
2517 : 325 : tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2518 : 325 : gimple_seq stmts = NULL, new_stmts = NULL;
2519 : 325 : tree iters, iters_name;
2520 : 325 : stmt_vec_info stmt_info = dr_info->stmt;
2521 : 325 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2522 : 325 : poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2523 : :
2524 : 325 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2525 : : {
2526 : 208 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2527 : :
2528 : 208 : if (dump_enabled_p ())
2529 : 199 : dump_printf_loc (MSG_NOTE, vect_location,
2530 : : "known peeling = %d.\n", npeel);
2531 : :
2532 : 208 : iters = build_int_cst (niters_type, npeel);
2533 : 208 : *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2534 : : }
2535 : : else
2536 : : {
2537 : 117 : tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
2538 : 117 : tree type = TREE_TYPE (misalign_in_elems);
2539 : 117 : HOST_WIDE_INT elem_size
2540 : 117 : = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2541 : : /* We only do prolog peeling if the target alignment is known at compile
2542 : : time. */
2543 : 117 : poly_uint64 align_in_elems =
2544 : 117 : exact_div (target_align, elem_size);
2545 : 117 : tree align_in_elems_minus_1 =
2546 : 117 : build_int_cst (type, align_in_elems - 1);
2547 : 117 : tree align_in_elems_tree = build_int_cst (type, align_in_elems);
2548 : :
2549 : : /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
2550 : : & (align_in_elems - 1)). */
2551 : 117 : bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2552 : 117 : size_zero_node) < 0;
2553 : 117 : if (negative)
2554 : 1 : iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
2555 : : align_in_elems_tree);
2556 : : else
2557 : 116 : iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
2558 : : misalign_in_elems);
2559 : 117 : iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
2560 : 117 : iters = fold_convert (niters_type, iters);
2561 : 117 : unsigned HOST_WIDE_INT align_in_elems_c;
2562 : 117 : if (align_in_elems.is_constant (&align_in_elems_c))
2563 : 117 : *bound = align_in_elems_c - 1;
2564 : : else
2565 : : *bound = -1;
2566 : : }
2567 : :
2568 : 325 : if (dump_enabled_p ())
2569 : 219 : dump_printf_loc (MSG_NOTE, vect_location,
2570 : : "niters for prolog loop: %T\n", iters);
2571 : :
2572 : 325 : var = create_tmp_var (niters_type, "prolog_loop_niters");
2573 : 325 : iters_name = force_gimple_operand (iters, &new_stmts, false, var);
2574 : :
2575 : 325 : if (new_stmts)
2576 : 117 : gimple_seq_add_seq (&stmts, new_stmts);
2577 : 325 : if (stmts)
2578 : : {
2579 : 117 : gcc_assert (single_succ_p (bb));
2580 : 117 : gimple_stmt_iterator gsi = gsi_last_bb (bb);
2581 : 117 : if (gsi_end_p (gsi))
2582 : 5 : gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2583 : : else
2584 : 112 : gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
2585 : : }
2586 : 325 : return iters_name;
2587 : : }
2588 : :
2589 : :
2590 : : /* Function vect_update_init_of_dr
2591 : :
2592 : : If CODE is PLUS, the vector loop starts NITERS iterations after the
2593 : : scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2594 : : iterations before the scalar one (using masking to skip inactive
2595 : : elements). This function updates the information recorded in DR to
2596 : : account for the difference. Specifically, it updates the OFFSET
2597 : : field of DR_INFO. */
2598 : :
2599 : : static void
2600 : 21373 : vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2601 : : {
2602 : 21373 : struct data_reference *dr = dr_info->dr;
2603 : 21373 : tree offset = dr_info->offset;
2604 : 21373 : if (!offset)
2605 : 21373 : offset = build_zero_cst (sizetype);
2606 : :
2607 : 21373 : niters = fold_build2 (MULT_EXPR, sizetype,
2608 : : fold_convert (sizetype, niters),
2609 : : fold_convert (sizetype, DR_STEP (dr)));
2610 : 21373 : offset = fold_build2 (code, sizetype,
2611 : : fold_convert (sizetype, offset), niters);
2612 : 21373 : dr_info->offset = offset;
2613 : 21373 : }
2614 : :
2615 : :
2616 : : /* Function vect_update_inits_of_drs
2617 : :
2618 : : Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
2619 : : CODE and NITERS are as for vect_update_inits_of_dr. */
2620 : :
2621 : : void
2622 : 6699 : vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2623 : : tree_code code)
2624 : : {
2625 : 6699 : unsigned int i;
2626 : 6699 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2627 : 6699 : struct data_reference *dr;
2628 : :
2629 : 6699 : DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2630 : :
2631 : : /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
2632 : : here, but since we might use these niters to update the epilogues niters
2633 : : and data references we can't insert them here as this definition might not
2634 : : always dominate its uses. */
2635 : 6699 : if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2636 : 4020 : niters = fold_convert (sizetype, niters);
2637 : :
2638 : 35102 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2639 : : {
2640 : 21788 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2641 : 21788 : if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2642 : 21373 : && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2643 : 21373 : vect_update_init_of_dr (dr_info, niters, code);
2644 : : }
2645 : 6699 : }
2646 : :
2647 : : /* For the information recorded in LOOP_VINFO prepare the loop for peeling
2648 : : by masking. This involves calculating the number of iterations to
2649 : : be peeled and then aligning all memory references appropriately. */
2650 : :
2651 : : void
2652 : 1 : vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2653 : : {
2654 : 1 : tree misalign_in_elems;
2655 : 1 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2656 : :
2657 : 1 : gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2658 : :
2659 : : /* From the information recorded in LOOP_VINFO get the number of iterations
2660 : : that need to be skipped via masking. */
2661 : 1 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2662 : : {
2663 : 1 : poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2664 : 1 : - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2665 : 1 : misalign_in_elems = build_int_cst (type, misalign);
2666 : : }
2667 : : else
2668 : : {
2669 : 0 : gimple_seq seq1 = NULL, seq2 = NULL;
2670 : 0 : misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
2671 : 0 : misalign_in_elems = fold_convert (type, misalign_in_elems);
2672 : 0 : misalign_in_elems = force_gimple_operand (misalign_in_elems,
2673 : : &seq2, true, NULL_TREE);
2674 : 0 : gimple_seq_add_seq (&seq1, seq2);
2675 : 0 : if (seq1)
2676 : : {
2677 : 0 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2678 : 0 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2679 : 0 : gcc_assert (!new_bb);
2680 : : }
2681 : : }
2682 : :
2683 : 1 : if (dump_enabled_p ())
2684 : 1 : dump_printf_loc (MSG_NOTE, vect_location,
2685 : : "misalignment for fully-masked loop: %T\n",
2686 : : misalign_in_elems);
2687 : :
2688 : 1 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2689 : :
2690 : 1 : vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
2691 : 1 : }
2692 : :
2693 : : /* This function builds ni_name = number of iterations. Statements
2694 : : are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
2695 : : it to TRUE if new ssa_var is generated. */
2696 : :
2697 : : tree
2698 : 55425 : vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2699 : : {
2700 : 55425 : tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2701 : 55425 : if (TREE_CODE (ni) == INTEGER_CST)
2702 : : return ni;
2703 : : else
2704 : : {
2705 : 23188 : tree ni_name, var;
2706 : 23188 : gimple_seq stmts = NULL;
2707 : 23188 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2708 : :
2709 : 23188 : var = create_tmp_var (TREE_TYPE (ni), "niters");
2710 : 23188 : ni_name = force_gimple_operand (ni, &stmts, false, var);
2711 : 23188 : if (stmts)
2712 : : {
2713 : 22207 : gsi_insert_seq_on_edge_immediate (pe, stmts);
2714 : 22207 : if (new_var_p != NULL)
2715 : 159 : *new_var_p = true;
2716 : : }
2717 : :
2718 : 23188 : return ni_name;
2719 : : }
2720 : : }
2721 : :
2722 : : /* Calculate the number of iterations above which vectorized loop will be
2723 : : preferred than scalar loop. NITERS_PROLOG is the number of iterations
2724 : : of prolog loop. If it's integer const, the integer number is also passed
2725 : : in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
2726 : : number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
2727 : : value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
2728 : : threshold below which the scalar (rather than vectorized) loop will be
2729 : : executed. This function stores the upper bound (inclusive) of the result
2730 : : in BOUND_SCALAR. */
2731 : :
2732 : : static tree
2733 : 22330 : vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2734 : : int bound_prolog, poly_int64 bound_epilog, int th,
2735 : : poly_uint64 *bound_scalar,
2736 : : bool check_profitability)
2737 : : {
2738 : 22330 : tree type = TREE_TYPE (niters_prolog);
2739 : 22330 : tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2740 : : build_int_cst (type, bound_epilog));
2741 : :
2742 : 22330 : *bound_scalar = bound_prolog + bound_epilog;
2743 : 22330 : if (check_profitability)
2744 : : {
2745 : : /* TH indicates the minimum niters of vectorized loop, while we
2746 : : compute the maximum niters of scalar loop. */
2747 : 13832 : th--;
2748 : : /* Peeling for constant times. */
2749 : 13832 : if (int_niters_prolog >= 0)
2750 : : {
2751 : 13806 : *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
2752 : 13806 : return build_int_cst (type, *bound_scalar);
2753 : : }
2754 : : /* Peeling an unknown number of times. Note that both BOUND_PROLOG
2755 : : and BOUND_EPILOG are inclusive upper bounds. */
2756 : 26 : if (known_ge (th, bound_prolog + bound_epilog))
2757 : : {
2758 : 24 : *bound_scalar = th;
2759 : 24 : return build_int_cst (type, th);
2760 : : }
2761 : : /* Need to do runtime comparison. */
2762 : 2 : else if (maybe_gt (th, bound_epilog))
2763 : : {
2764 : 2 : *bound_scalar = upper_bound (*bound_scalar, th);
2765 : 2 : return fold_build2 (MAX_EXPR, type,
2766 : : build_int_cst (type, th), niters);
2767 : : }
2768 : : }
2769 : : return niters;
2770 : : }
2771 : :
2772 : : /* NITERS is the number of times that the original scalar loop executes
2773 : : after peeling. Work out the maximum number of iterations N that can
2774 : : be handled by the vectorized form of the loop and then either:
2775 : :
2776 : : a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2777 : :
2778 : : niters_vector = N
2779 : :
2780 : : b) set *STEP_VECTOR_PTR to one and generate:
2781 : :
2782 : : niters_vector = N / vf
2783 : :
2784 : : In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2785 : : any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
2786 : : is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
2787 : :
2788 : : void
2789 : 31014 : vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2790 : : tree *niters_vector_ptr, tree *step_vector_ptr,
2791 : : bool niters_no_overflow)
2792 : : {
2793 : 31014 : tree ni_minus_gap, var;
2794 : 31014 : tree niters_vector, step_vector, type = TREE_TYPE (niters);
2795 : 31014 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2796 : 31014 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2797 : 31014 : tree log_vf = NULL_TREE;
2798 : :
2799 : : /* If epilogue loop is required because of data accesses with gaps, we
2800 : : subtract one iteration from the total number of iterations here for
2801 : : correct calculation of RATIO. */
2802 : 31014 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2803 : : {
2804 : 317 : ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2805 : : build_one_cst (type));
2806 : 317 : if (!is_gimple_val (ni_minus_gap))
2807 : : {
2808 : 147 : var = create_tmp_var (type, "ni_gap");
2809 : 147 : gimple *stmts = NULL;
2810 : 147 : ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2811 : : true, var);
2812 : 147 : gsi_insert_seq_on_edge_immediate (pe, stmts);
2813 : : }
2814 : : }
2815 : : else
2816 : : ni_minus_gap = niters;
2817 : :
2818 : : /* To silence some unexpected warnings, simply initialize to 0. */
2819 : 31014 : unsigned HOST_WIDE_INT const_vf = 0;
2820 : 31014 : if (vf.is_constant (&const_vf)
2821 : 31014 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2822 : : {
2823 : : /* Create: niters >> log2(vf) */
2824 : : /* If it's known that niters == number of latch executions + 1 doesn't
2825 : : overflow, we can generate niters >> log2(vf); otherwise we generate
2826 : : (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2827 : : will be at least one. */
2828 : 62014 : log_vf = build_int_cst (type, exact_log2 (const_vf));
2829 : 31007 : if (niters_no_overflow)
2830 : 30825 : niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2831 : : else
2832 : 182 : niters_vector
2833 : 182 : = fold_build2 (PLUS_EXPR, type,
2834 : : fold_build2 (RSHIFT_EXPR, type,
2835 : : fold_build2 (MINUS_EXPR, type,
2836 : : ni_minus_gap,
2837 : : build_int_cst (type, vf)),
2838 : : log_vf),
2839 : : build_int_cst (type, 1));
2840 : 31007 : step_vector = build_one_cst (type);
2841 : : }
2842 : : else
2843 : : {
2844 : 7 : niters_vector = ni_minus_gap;
2845 : 7 : step_vector = build_int_cst (type, vf);
2846 : : }
2847 : :
2848 : 31014 : if (!is_gimple_val (niters_vector))
2849 : : {
2850 : 23007 : var = create_tmp_var (type, "bnd");
2851 : 23007 : gimple_seq stmts = NULL;
2852 : 23007 : niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2853 : 23007 : gsi_insert_seq_on_edge_immediate (pe, stmts);
2854 : : /* Peeling algorithm guarantees that vector loop bound is at least ONE,
2855 : : we set range information to make niters analyzer's life easier.
2856 : : Note the number of latch iteration value can be TYPE_MAX_VALUE so
2857 : : we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf. */
2858 : 23007 : if (stmts != NULL && log_vf)
2859 : : {
2860 : 23003 : if (niters_no_overflow)
2861 : : {
2862 : 22839 : int_range<1> vr (type,
2863 : 22839 : wi::one (TYPE_PRECISION (type)),
2864 : 45678 : wi::rshift (wi::max_value (TYPE_PRECISION (type),
2865 : 22839 : TYPE_SIGN (type)),
2866 : 22839 : exact_log2 (const_vf),
2867 : 45678 : TYPE_SIGN (type)));
2868 : 22839 : set_range_info (niters_vector, vr);
2869 : 22839 : }
2870 : : /* For VF == 1 the vector IV might also overflow so we cannot
2871 : : assert a minimum value of 1. */
2872 : 164 : else if (const_vf > 1)
2873 : : {
2874 : 128 : int_range<1> vr (type,
2875 : 128 : wi::one (TYPE_PRECISION (type)),
2876 : 384 : wi::rshift (wi::max_value (TYPE_PRECISION (type),
2877 : 128 : TYPE_SIGN (type))
2878 : 256 : - (const_vf - 1),
2879 : 256 : exact_log2 (const_vf), TYPE_SIGN (type))
2880 : 384 : + 1);
2881 : 128 : set_range_info (niters_vector, vr);
2882 : 128 : }
2883 : : }
2884 : : }
2885 : 31014 : *niters_vector_ptr = niters_vector;
2886 : 31014 : *step_vector_ptr = step_vector;
2887 : :
2888 : 31014 : return;
2889 : : }
2890 : :
2891 : : /* Given NITERS_VECTOR which is the number of iterations for vectorized
2892 : : loop specified by LOOP_VINFO after vectorization, compute the number
2893 : : of iterations before vectorization (niters_vector * vf) and store it
2894 : : to NITERS_VECTOR_MULT_VF_PTR. */
2895 : :
2896 : : static void
2897 : 30336 : vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2898 : : tree niters_vector,
2899 : : tree *niters_vector_mult_vf_ptr)
2900 : : {
2901 : : /* We should be using a step_vector of VF if VF is variable. */
2902 : 30336 : int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2903 : 30336 : tree type = TREE_TYPE (niters_vector);
2904 : 60672 : tree log_vf = build_int_cst (type, exact_log2 (vf));
2905 : 30336 : tree tree_vf = build_int_cst (type, vf);
2906 : 30336 : basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2907 : :
2908 : 30336 : gcc_assert (niters_vector_mult_vf_ptr != NULL);
2909 : 30336 : tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2910 : : niters_vector, log_vf);
2911 : :
2912 : : /* If we've peeled a vector iteration then subtract one full vector
2913 : : iteration. */
2914 : 30336 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2915 : 19 : niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
2916 : : niters_vector_mult_vf, tree_vf);
2917 : :
2918 : 30336 : if (!is_gimple_val (niters_vector_mult_vf))
2919 : : {
2920 : 22351 : tree var = create_tmp_var (type, "niters_vector_mult_vf");
2921 : 22351 : gimple_seq stmts = NULL;
2922 : 22351 : niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2923 : : &stmts, true, var);
2924 : 22351 : gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2925 : 22351 : gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2926 : : }
2927 : 30336 : *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2928 : 30336 : }
2929 : :
2930 : : /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2931 : : of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2932 : : are two pred edges of the merge point before UPDATE_LOOP. The two loops
2933 : : appear like below:
2934 : :
2935 : : guard_bb:
2936 : : if (cond)
2937 : : goto merge_bb;
2938 : : else
2939 : : goto skip_loop;
2940 : :
2941 : : skip_loop:
2942 : : header_a:
2943 : : i_1 = PHI<i_0, i_2>;
2944 : : ...
2945 : : i_2 = i_1 + 1;
2946 : : if (cond_a)
2947 : : goto latch_a;
2948 : : else
2949 : : goto exit_a;
2950 : : latch_a:
2951 : : goto header_a;
2952 : :
2953 : : exit_a:
2954 : : i_5 = PHI<i_2>;
2955 : :
2956 : : merge_bb:
2957 : : ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2958 : :
2959 : : update_loop:
2960 : : header_b:
2961 : : i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2962 : : ...
2963 : : i_4 = i_3 + 1;
2964 : : if (cond_b)
2965 : : goto latch_b;
2966 : : else
2967 : : goto exit_bb;
2968 : : latch_b:
2969 : : goto header_b;
2970 : :
2971 : : exit_bb:
2972 : :
2973 : : This function creates PHI nodes at merge_bb and replaces the use of i_5
2974 : : in the update_loop's PHI node with the result of new PHI result. */
2975 : :
2976 : : static void
2977 : 22655 : slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2978 : : class loop *update_loop,
2979 : : edge guard_edge, edge merge_edge)
2980 : : {
2981 : 22655 : location_t merge_loc, guard_loc;
2982 : 22655 : edge orig_e = loop_preheader_edge (skip_loop);
2983 : 22655 : edge update_e = loop_preheader_edge (update_loop);
2984 : 22655 : gphi_iterator gsi_orig, gsi_update;
2985 : :
2986 : 22655 : for ((gsi_orig = gsi_start_phis (skip_loop->header),
2987 : 22655 : gsi_update = gsi_start_phis (update_loop->header));
2988 : 84420 : !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2989 : 61765 : gsi_next (&gsi_orig), gsi_next (&gsi_update))
2990 : : {
2991 : 61765 : gphi *orig_phi = gsi_orig.phi ();
2992 : 61765 : gphi *update_phi = gsi_update.phi ();
2993 : :
2994 : : /* Generate new phi node at merge bb of the guard. */
2995 : 61765 : tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2996 : 61765 : gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2997 : :
2998 : : /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2999 : : args in NEW_PHI for these edges. */
3000 : 61765 : tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
3001 : 61765 : tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
3002 : 61765 : merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
3003 : 61765 : guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
3004 : 61765 : add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
3005 : 61765 : add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
3006 : :
3007 : : /* Update phi in UPDATE_PHI. */
3008 : 61765 : adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
3009 : : }
3010 : 22655 : }
3011 : :
3012 : : /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
3013 : : Return a value that equals:
3014 : :
3015 : : - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
3016 : : - SKIP_VALUE when the main loop is skipped. */
3017 : :
3018 : : tree
3019 : 3796 : vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
3020 : : tree skip_value)
3021 : : {
3022 : 3796 : gcc_assert (loop_vinfo->main_loop_edge);
3023 : :
3024 : 3796 : tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
3025 : 3796 : basic_block bb = loop_vinfo->main_loop_edge->dest;
3026 : 3796 : gphi *new_phi = create_phi_node (phi_result, bb);
3027 : 3796 : add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
3028 : : UNKNOWN_LOCATION);
3029 : 3796 : add_phi_arg (new_phi, skip_value,
3030 : : loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
3031 : 3796 : return phi_result;
3032 : : }
3033 : :
3034 : : /* Function vect_do_peeling.
3035 : :
3036 : : Input:
3037 : : - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
3038 : :
3039 : : preheader:
3040 : : LOOP:
3041 : : header_bb:
3042 : : loop_body
3043 : : if (exit_loop_cond) goto exit_bb
3044 : : else goto header_bb
3045 : : exit_bb:
3046 : :
3047 : : - NITERS: The number of iterations of the loop.
3048 : : - NITERSM1: The number of iterations of the loop's latch.
3049 : : - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
3050 : : - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
3051 : : CHECK_PROFITABILITY is true.
3052 : : Output:
3053 : : - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
3054 : : iterate after vectorization; see vect_set_loop_condition for details.
3055 : : - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
3056 : : should be set to the number of scalar iterations handled by the
3057 : : vector loop. The SSA name is only used on exit from the loop.
3058 : :
3059 : : This function peels prolog and epilog from the loop, adds guards skipping
3060 : : PROLOG and EPILOG for various conditions. As a result, the changed CFG
3061 : : would look like:
3062 : :
3063 : : guard_bb_1:
3064 : : if (prefer_scalar_loop) goto merge_bb_1
3065 : : else goto guard_bb_2
3066 : :
3067 : : guard_bb_2:
3068 : : if (skip_prolog) goto merge_bb_2
3069 : : else goto prolog_preheader
3070 : :
3071 : : prolog_preheader:
3072 : : PROLOG:
3073 : : prolog_header_bb:
3074 : : prolog_body
3075 : : if (exit_prolog_cond) goto prolog_exit_bb
3076 : : else goto prolog_header_bb
3077 : : prolog_exit_bb:
3078 : :
3079 : : merge_bb_2:
3080 : :
3081 : : vector_preheader:
3082 : : VECTOR LOOP:
3083 : : vector_header_bb:
3084 : : vector_body
3085 : : if (exit_vector_cond) goto vector_exit_bb
3086 : : else goto vector_header_bb
3087 : : vector_exit_bb:
3088 : :
3089 : : guard_bb_3:
3090 : : if (skip_epilog) goto merge_bb_3
3091 : : else goto epilog_preheader
3092 : :
3093 : : merge_bb_1:
3094 : :
3095 : : epilog_preheader:
3096 : : EPILOG:
3097 : : epilog_header_bb:
3098 : : epilog_body
3099 : : if (exit_epilog_cond) goto merge_bb_3
3100 : : else goto epilog_header_bb
3101 : :
3102 : : merge_bb_3:
3103 : :
3104 : : Note this function peels prolog and epilog only if it's necessary,
3105 : : as well as guards.
3106 : : This function returns the epilogue loop if a decision was made to vectorize
3107 : : it, otherwise NULL.
3108 : :
3109 : : The analysis resulting in this epilogue loop's loop_vec_info was performed
3110 : : in the same vect_analyze_loop call as the main loop's. At that time
3111 : : vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
3112 : : vectorization factors than the main loop. This list is chained in the
3113 : : loop's loop_vec_info in the 'epilogue_vinfo' member. When we decide to
3114 : : vectorize the epilogue loop for a lower vectorization factor, the
3115 : : loop_vec_info in epilogue_vinfo is updated and linked to the epilogue loop.
3116 : : This is later used to vectorize the epilogue.
3117 : : The reason the loop_vec_info needs updating is that it was
3118 : : constructed based on the original main loop, and the epilogue loop is a
3119 : : copy of this loop, so all links pointing to statements in the original loop
3120 : : need updating. Furthermore, these loop_vec_infos share the
3121 : : data_reference's records, which will also need to be updated.
3122 : :
3123 : : TODO: Guard for prefer_scalar_loop should be emitted along with
3124 : : versioning conditions if loop versioning is needed. */
3125 : :
3126 : :
3127 : : class loop *
3128 : 55100 : vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
3129 : : tree *niters_vector, tree *step_vector,
3130 : : tree *niters_vector_mult_vf_var, int th,
3131 : : bool check_profitability, bool niters_no_overflow,
3132 : : tree *advance)
3133 : : {
3134 : 55100 : edge e, guard_e;
3135 : 55100 : tree type = TREE_TYPE (niters), guard_cond;
3136 : 55100 : basic_block guard_bb, guard_to;
3137 : 55100 : profile_probability prob_prolog, prob_vector, prob_epilog;
3138 : 55100 : int estimated_vf;
3139 : 55100 : int prolog_peeling = 0;
3140 : 55100 : bool vect_epilogues = loop_vinfo->epilogue_vinfo != NULL;
3141 : :
3142 : 55100 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3143 : 55099 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3144 : :
3145 : 55100 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3146 : 55100 : poly_uint64 bound_epilog = 0;
3147 : 55100 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
3148 : 55093 : && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3149 : 29341 : bound_epilog += vf - 1;
3150 : 55100 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3151 : 317 : bound_epilog += 1;
3152 : :
3153 : : /* For early breaks the scalar loop needs to execute at most VF times
3154 : : to find the element that caused the break. */
3155 : 55100 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3156 : 1340 : bound_epilog = vf;
3157 : :
3158 : 55100 : bool epilog_peeling = maybe_ne (bound_epilog, 0U);
3159 : 55100 : poly_uint64 bound_scalar = bound_epilog;
3160 : :
3161 : 55100 : if (!prolog_peeling && !epilog_peeling)
3162 : : return NULL;
3163 : :
3164 : : /* Before doing any peeling make sure to reset debug binds outside of
3165 : : the loop refering to defs not in LC SSA. */
3166 : 30367 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3167 : 92861 : for (unsigned i = 0; i < loop->num_nodes; ++i)
3168 : : {
3169 : 62494 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3170 : 62494 : imm_use_iterator ui;
3171 : 62494 : gimple *use_stmt;
3172 : 149290 : for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3173 : 86796 : gsi_next (&gsi))
3174 : : {
3175 : 274384 : FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
3176 : 206555 : if (gimple_debug_bind_p (use_stmt)
3177 : 18967 : && loop != gimple_bb (use_stmt)->loop_father
3178 : 18983 : && !flow_loop_nested_p (loop,
3179 : 16 : gimple_bb (use_stmt)->loop_father))
3180 : : {
3181 : 2 : gimple_debug_bind_reset_value (use_stmt);
3182 : 2 : update_stmt (use_stmt);
3183 : 86796 : }
3184 : : }
3185 : 588445 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3186 : 463457 : gsi_next (&gsi))
3187 : : {
3188 : 463457 : ssa_op_iter op_iter;
3189 : 463457 : def_operand_p def_p;
3190 : 752306 : FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
3191 : 724922 : FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
3192 : 471617 : if (gimple_debug_bind_p (use_stmt)
3193 : 35544 : && loop != gimple_bb (use_stmt)->loop_father
3194 : 35571 : && !flow_loop_nested_p (loop,
3195 : 27 : gimple_bb (use_stmt)->loop_father))
3196 : : {
3197 : 0 : gimple_debug_bind_reset_value (use_stmt);
3198 : 0 : update_stmt (use_stmt);
3199 : 288849 : }
3200 : : }
3201 : : }
3202 : :
3203 : 30367 : prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
3204 : 30367 : estimated_vf = vect_vf_for_cost (loop_vinfo);
3205 : 30367 : if (estimated_vf == 2)
3206 : 5560 : estimated_vf = 3;
3207 : 30367 : prob_prolog = prob_epilog = profile_probability::guessed_always ()
3208 : 30367 : .apply_scale (estimated_vf - 1, estimated_vf);
3209 : :
3210 : 30367 : class loop *prolog = NULL, *epilog = NULL;
3211 : 30367 : class loop *first_loop = loop;
3212 : 30367 : bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
3213 : :
3214 : : /* SSA form needs to be up-to-date since we are going to manually
3215 : : update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete all
3216 : : update SSA state after that, so we have to make sure to not lose any
3217 : : pending update needs. */
3218 : 30367 : gcc_assert (!need_ssa_update_p (cfun));
3219 : :
3220 : : /* If we're vectorizing an epilogue loop, we have ensured that the
3221 : : virtual operand is in SSA form throughout the vectorized main loop.
3222 : : Normally it is possible to trace the updated
3223 : : vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
3224 : : back to scalar-stmt vuses, meaning that the effect of the SSA update
3225 : : remains local to the main loop. However, there are rare cases in
3226 : : which the vectorized loop should have vdefs even when the original scalar
3227 : : loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
3228 : : introduces clobbers of the temporary vector array, which in turn
3229 : : needs new vdefs. If the scalar loop doesn't write to memory, these
3230 : : new vdefs will be the only ones in the vector loop.
3231 : : We are currently defering updating virtual SSA form and creating
3232 : : of a virtual PHI for this case so we do not have to make sure the
3233 : : newly introduced virtual def is in LCSSA form. */
3234 : :
3235 : 30367 : if (MAY_HAVE_DEBUG_BIND_STMTS)
3236 : : {
3237 : 11433 : gcc_assert (!adjust_vec.exists ());
3238 : 11433 : adjust_vec.create (32);
3239 : : }
3240 : 30367 : initialize_original_copy_tables ();
3241 : :
3242 : : /* Record the anchor bb at which the guard should be placed if the scalar
3243 : : loop might be preferred. */
3244 : 30367 : basic_block anchor = loop_preheader_edge (loop)->src;
3245 : :
3246 : : /* Generate the number of iterations for the prolog loop. We do this here
3247 : : so that we can also get the upper bound on the number of iterations. */
3248 : 30367 : tree niters_prolog;
3249 : 30367 : int bound_prolog = 0;
3250 : 30367 : if (prolog_peeling)
3251 : : {
3252 : 325 : niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
3253 : : &bound_prolog);
3254 : : /* If algonment peeling is known, we will always execute prolog. */
3255 : 325 : if (TREE_CODE (niters_prolog) == INTEGER_CST)
3256 : 208 : prob_prolog = profile_probability::always ();
3257 : : }
3258 : : else
3259 : 30042 : niters_prolog = build_int_cst (type, 0);
3260 : :
3261 : 30367 : loop_vec_info epilogue_vinfo = loop_vinfo->epilogue_vinfo;
3262 : 30367 : tree niters_vector_mult_vf = NULL_TREE;
3263 : : /* Saving NITERs before the loop, as this may be changed by prologue. */
3264 : 30367 : tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
3265 : 30367 : edge update_e = NULL, skip_e = NULL;
3266 : 30367 : unsigned int lowest_vf = constant_lower_bound (vf);
3267 : : /* Prolog loop may be skipped. */
3268 : 30367 : bool skip_prolog = (prolog_peeling != 0);
3269 : : /* Skip this loop to epilog when there are not enough iterations to enter this
3270 : : vectorized loop. If true we should perform runtime checks on the NITERS
3271 : : to check whether we should skip the current vectorized loop. If we know
3272 : : the number of scalar iterations we may choose to add a runtime check if
3273 : : this number "maybe" smaller than the number of iterations required
3274 : : when we know the number of scalar iterations may potentially
3275 : : be smaller than the number of iterations required to enter this loop, for
3276 : : this we use the upper bounds on the prolog and epilog peeling. When we
3277 : : don't know the number of iterations and don't require versioning it is
3278 : : because we have asserted that there are enough scalar iterations to enter
3279 : : the main loop, so this skip is not necessary. When we are versioning then
3280 : : we only add such a skip if we have chosen to vectorize the epilogue. */
3281 : 8011 : bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3282 : 38378 : ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
3283 : 8011 : bound_prolog + bound_epilog)
3284 : 22356 : : (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3285 : 26 : || vect_epilogues));
3286 : :
3287 : : /* Epilog loop must be executed if the number of iterations for epilog
3288 : : loop is known at compile time, otherwise we need to add a check at
3289 : : the end of vector loop and skip to the end of epilog loop. */
3290 : 30367 : bool skip_epilog = (prolog_peeling < 0
3291 : 30250 : || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3292 : 30367 : || !vf.is_constant ());
3293 : : /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
3294 : : loop must be executed. */
3295 : 30367 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3296 : 30050 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3297 : 1657 : skip_epilog = false;
3298 : :
3299 : 30367 : class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3300 : 30367 : auto_vec<profile_count> original_counts;
3301 : 30367 : basic_block *original_bbs = NULL;
3302 : :
3303 : 30367 : if (skip_vector)
3304 : : {
3305 : 22330 : split_edge (loop_preheader_edge (loop));
3306 : :
3307 : 22330 : if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3308 : : {
3309 : 20751 : original_bbs = get_loop_body (loop);
3310 : 62574 : for (unsigned int i = 0; i < loop->num_nodes; i++)
3311 : 41823 : original_counts.safe_push(original_bbs[i]->count);
3312 : : }
3313 : :
3314 : : /* Due to the order in which we peel prolog and epilog, we first
3315 : : propagate probability to the whole loop. The purpose is to
3316 : : avoid adjusting probabilities of both prolog and vector loops
3317 : : separately. Note in this case, the probability of epilog loop
3318 : : needs to be scaled back later. */
3319 : 22330 : basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3320 : 22330 : if (prob_vector.initialized_p ())
3321 : : {
3322 : 22330 : scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3323 : 22330 : scale_loop_profile (loop, prob_vector, -1);
3324 : : }
3325 : : }
3326 : :
3327 : 30367 : if (vect_epilogues)
3328 : : {
3329 : : /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3330 : : use the original scalar loop as remaining epilogue if necessary. */
3331 : 6373 : LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3332 : 6373 : = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3333 : 6373 : LOOP_VINFO_SCALAR_IV_EXIT (epilogue_vinfo)
3334 : 6373 : = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3335 : : }
3336 : :
3337 : 30367 : if (prolog_peeling)
3338 : : {
3339 : 325 : e = loop_preheader_edge (loop);
3340 : 325 : edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3341 : 325 : gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e)
3342 : : && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo));
3343 : :
3344 : : /* Peel prolog and put it on preheader edge of loop. */
3345 : 325 : edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3346 : 325 : edge prolog_e = NULL;
3347 : 325 : prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, exit_e,
3348 : : scalar_loop, scalar_e,
3349 : : e, &prolog_e);
3350 : 325 : gcc_assert (prolog);
3351 : 325 : prolog->force_vectorize = false;
3352 : :
3353 : 325 : first_loop = prolog;
3354 : 325 : reset_original_copy_tables ();
3355 : :
3356 : : /* Update the number of iterations for prolog loop. */
3357 : 325 : tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3358 : 325 : vect_set_loop_condition (prolog, prolog_e, NULL, niters_prolog,
3359 : : step_prolog, NULL_TREE, false);
3360 : :
3361 : : /* Skip the prolog loop. */
3362 : 325 : if (skip_prolog)
3363 : : {
3364 : 325 : guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3365 : : niters_prolog, build_int_cst (type, 0));
3366 : 325 : guard_bb = loop_preheader_edge (prolog)->src;
3367 : 325 : basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3368 : 325 : guard_to = split_edge (loop_preheader_edge (loop));
3369 : 325 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3370 : : guard_to, guard_bb,
3371 : : prob_prolog.invert (),
3372 : : irred_flag);
3373 : 1432 : for (edge alt_e : get_loop_exit_edges (prolog))
3374 : : {
3375 : 457 : if (alt_e == prolog_e)
3376 : 325 : continue;
3377 : 132 : basic_block old_dom
3378 : 132 : = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
3379 : 132 : if (flow_bb_inside_loop_p (prolog, old_dom))
3380 : : {
3381 : 45 : auto_vec<basic_block, 8> queue;
3382 : 45 : for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
3383 : 149 : son; son = next_dom_son (CDI_DOMINATORS, son))
3384 : 104 : if (!flow_bb_inside_loop_p (prolog, son))
3385 : 59 : queue.safe_push (son);
3386 : 194 : for (auto son : queue)
3387 : 59 : set_immediate_dominator (CDI_DOMINATORS, son, guard_bb);
3388 : 45 : }
3389 : 325 : }
3390 : :
3391 : 325 : e = EDGE_PRED (guard_to, 0);
3392 : 325 : e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3393 : 325 : slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
3394 : :
3395 : 325 : scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3396 : 325 : scale_loop_profile (prolog, prob_prolog, bound_prolog - 1);
3397 : : }
3398 : :
3399 : : /* Update init address of DRs. */
3400 : 325 : vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
3401 : : /* Update niters for vector loop. */
3402 : 325 : LOOP_VINFO_NITERS (loop_vinfo)
3403 : 325 : = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3404 : 325 : LOOP_VINFO_NITERSM1 (loop_vinfo)
3405 : 325 : = fold_build2 (MINUS_EXPR, type,
3406 : : LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3407 : 325 : bool new_var_p = false;
3408 : 325 : niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
3409 : : /* It's guaranteed that vector loop bound before vectorization is at
3410 : : least VF, so set range information for newly generated var. */
3411 : 325 : if (new_var_p)
3412 : : {
3413 : 159 : int_range<1> vr (type,
3414 : 159 : wi::to_wide (build_int_cst (type, lowest_vf)),
3415 : 318 : wi::to_wide (TYPE_MAX_VALUE (type)));
3416 : 159 : set_range_info (niters, vr);
3417 : 159 : }
3418 : :
3419 : : /* Prolog iterates at most bound_prolog times, latch iterates at
3420 : : most bound_prolog - 1 times. */
3421 : 325 : record_niter_bound (prolog, bound_prolog - 1, false, true);
3422 : 325 : delete_update_ssa ();
3423 : 325 : adjust_vec_debug_stmts ();
3424 : 325 : scev_reset ();
3425 : : }
3426 : 30367 : basic_block bb_before_epilog = NULL;
3427 : :
3428 : 30367 : if (epilog_peeling)
3429 : : {
3430 : 30336 : e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3431 : 30336 : gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e));
3432 : :
3433 : : /* Peel epilog and put it on exit edge of loop. If we are vectorizing
3434 : : said epilog then we should use a copy of the main loop as a starting
3435 : : point. This loop may have already had some preliminary transformations
3436 : : to allow for more optimal vectorization, for example if-conversion.
3437 : : If we are not vectorizing the epilog then we should use the scalar loop
3438 : : as the transformations mentioned above make less or no sense when not
3439 : : vectorizing. */
3440 : 30336 : edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3441 : 30336 : epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3442 : 6373 : edge epilog_e = vect_epilogues ? e : scalar_e;
3443 : 30336 : edge new_epilog_e = NULL;
3444 : 30336 : auto_vec<basic_block> doms;
3445 : 30336 : epilog
3446 : 30336 : = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e, epilog, epilog_e, e,
3447 : : &new_epilog_e, true, &doms);
3448 : :
3449 : 30336 : LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
3450 : 30336 : gcc_assert (epilog);
3451 : 30336 : gcc_assert (new_epilog_e);
3452 : 30336 : epilog->force_vectorize = false;
3453 : 30336 : bb_before_epilog = loop_preheader_edge (epilog)->src;
3454 : :
3455 : : /* Scalar version loop may be preferred. In this case, add guard
3456 : : and skip to epilog. Note this only happens when the number of
3457 : : iterations of loop is unknown at compile time, otherwise this
3458 : : won't be vectorized. */
3459 : 30336 : if (skip_vector)
3460 : : {
3461 : : /* Additional epilogue iteration is peeled if gap exists. */
3462 : 22330 : tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3463 : : bound_prolog, bound_epilog,
3464 : : th, &bound_scalar,
3465 : : check_profitability);
3466 : : /* Build guard against NITERSM1 since NITERS may overflow. */
3467 : 22330 : guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3468 : 22330 : guard_bb = anchor;
3469 : 22330 : guard_to = split_edge (loop_preheader_edge (epilog));
3470 : 22330 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3471 : : guard_to, guard_bb,
3472 : : prob_vector.invert (),
3473 : : irred_flag);
3474 : 22330 : skip_e = guard_e;
3475 : 22330 : e = EDGE_PRED (guard_to, 0);
3476 : 22330 : e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3477 : :
3478 : : /* Handle any remaining dominator updates needed after
3479 : : inserting the loop skip edge above. */
3480 : 22330 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
3481 : 228 : && prolog_peeling)
3482 : : {
3483 : : /* Adding a skip edge to skip a loop with multiple exits
3484 : : means the dominator of the join blocks for all exits shifts
3485 : : from the prolog skip guard to the loop skip guard. */
3486 : 112 : auto prolog_skip_bb
3487 : 112 : = single_pred (loop_preheader_edge (prolog)->src);
3488 : 112 : auto needs_update
3489 : 112 : = get_dominated_by (CDI_DOMINATORS, prolog_skip_bb);
3490 : :
3491 : : /* Update everything except for the immediate children of
3492 : : the prolog skip block (the prolog and vector preheaders).
3493 : : Those should remain dominated by the prolog skip block itself,
3494 : : since the loop guard edge goes to the epilogue. */
3495 : 597 : for (auto bb : needs_update)
3496 : 261 : if (bb != EDGE_SUCC (prolog_skip_bb, 0)->dest
3497 : 261 : && bb != EDGE_SUCC (prolog_skip_bb, 1)->dest)
3498 : 37 : set_immediate_dominator (CDI_DOMINATORS, bb, guard_bb);
3499 : 112 : }
3500 : :
3501 : 22330 : slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3502 : :
3503 : : /* Simply propagate profile info from guard_bb to guard_to which is
3504 : : a merge point of control flow. */
3505 : 22330 : profile_count old_count = guard_to->count;
3506 : 22330 : guard_to->count = guard_bb->count;
3507 : :
3508 : : /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3509 : 22330 : if (vect_epilogues || scalar_loop == NULL)
3510 : : {
3511 : 20751 : gcc_assert(epilog->num_nodes == loop->num_nodes);
3512 : 20751 : basic_block *bbs = get_loop_body (epilog);
3513 : 62574 : for (unsigned int i = 0; i < epilog->num_nodes; i++)
3514 : : {
3515 : 41823 : gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
3516 : 41823 : bbs[i]->count = original_counts[i];
3517 : : }
3518 : 20751 : free (bbs);
3519 : 20751 : free (original_bbs);
3520 : : }
3521 : 1579 : else if (old_count.nonzero_p ())
3522 : 1579 : scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
3523 : :
3524 : : /* Only need to handle basic block before epilog loop if it's not
3525 : : the guard_bb, which is the case when skip_vector is true. */
3526 : 22330 : if (guard_bb != bb_before_epilog && single_pred_p (bb_before_epilog))
3527 : 22102 : bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
3528 : 22330 : bb_before_epilog = loop_preheader_edge (epilog)->src;
3529 : : }
3530 : :
3531 : : /* If loop is peeled for non-zero constant times, now niters refers to
3532 : : orig_niters - prolog_peeling, it won't overflow even the orig_niters
3533 : : overflows. */
3534 : 30336 : niters_no_overflow |= (prolog_peeling > 0);
3535 : 30336 : vect_gen_vector_loop_niters (loop_vinfo, niters,
3536 : : niters_vector, step_vector,
3537 : : niters_no_overflow);
3538 : 30336 : if (!integer_onep (*step_vector))
3539 : : {
3540 : : /* On exit from the loop we will have an easy way of calcalating
3541 : : NITERS_VECTOR / STEP * STEP. Install a dummy definition
3542 : : until then. */
3543 : 0 : niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3544 : 0 : SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3545 : 0 : *niters_vector_mult_vf_var = niters_vector_mult_vf;
3546 : : }
3547 : : else
3548 : 30336 : vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3549 : : &niters_vector_mult_vf);
3550 : : /* Update IVs of original loop as if they were advanced by
3551 : : niters_vector_mult_vf steps. */
3552 : 30336 : gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3553 : 30336 : update_e = skip_vector ? e : loop_preheader_edge (epilog);
3554 : 30336 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3555 : 1340 : update_e = single_succ_edge (LOOP_VINFO_IV_EXIT (loop_vinfo)->dest);
3556 : :
3557 : : /* If we have a peeled vector iteration, all exits are the same, leave it
3558 : : and so the main exit needs to be treated the same as the alternative
3559 : : exits in that we leave their updates to vectorizable_live_operations.
3560 : : */
3561 : 30336 : if (!LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3562 : 30317 : vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
3563 : : update_e);
3564 : :
3565 : : /* If we have a peeled vector iteration we will never skip the epilog loop
3566 : : and we can simplify the cfg a lot by not doing the edge split. */
3567 : 30336 : if (skip_epilog
3568 : 30336 : || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
3569 : 1340 : && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)))
3570 : : {
3571 : 23281 : guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3572 : : niters, niters_vector_mult_vf);
3573 : :
3574 : 23281 : guard_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
3575 : 23281 : edge epilog_e = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3576 : 23281 : guard_to = epilog_e->dest;
3577 : 24389 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3578 : : skip_vector ? anchor : guard_bb,
3579 : : prob_epilog.invert (),
3580 : : irred_flag);
3581 : 23281 : doms.safe_push (guard_to);
3582 : 23281 : if (vect_epilogues)
3583 : 5160 : epilogue_vinfo->skip_this_loop_edge = guard_e;
3584 : 23281 : edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
3585 : 23281 : gphi_iterator gsi2 = gsi_start_phis (main_iv->dest);
3586 : 23281 : for (gphi_iterator gsi = gsi_start_phis (guard_to);
3587 : 56858 : !gsi_end_p (gsi); gsi_next (&gsi))
3588 : : {
3589 : : /* We are expecting all of the PHIs we have on epilog_e
3590 : : to be also on the main loop exit. But sometimes
3591 : : a stray virtual definition can appear at epilog_e
3592 : : which we can then take as the same on all exits,
3593 : : we've removed the LC SSA PHI on the main exit before
3594 : : so we wouldn't need to create a loop PHI for it. */
3595 : 33577 : if (virtual_operand_p (gimple_phi_result (*gsi))
3596 : 33577 : && (gsi_end_p (gsi2)
3597 : 32222 : || !virtual_operand_p (gimple_phi_result (*gsi2))))
3598 : 64 : add_phi_arg (*gsi,
3599 : 32 : gimple_phi_arg_def_from_edge (*gsi, epilog_e),
3600 : : guard_e, UNKNOWN_LOCATION);
3601 : : else
3602 : : {
3603 : 33545 : add_phi_arg (*gsi, gimple_phi_result (*gsi2), guard_e,
3604 : : UNKNOWN_LOCATION);
3605 : 33545 : gsi_next (&gsi2);
3606 : : }
3607 : : }
3608 : :
3609 : : /* Only need to handle basic block before epilog loop if it's not
3610 : : the guard_bb, which is the case when skip_vector is true. */
3611 : 23281 : if (guard_bb != bb_before_epilog)
3612 : : {
3613 : 23276 : prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3614 : :
3615 : 23276 : scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3616 : : }
3617 : 23281 : scale_loop_profile (epilog, prob_epilog, -1);
3618 : : }
3619 : :
3620 : : /* Recalculate the dominators after adding the guard edge. */
3621 : 30336 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3622 : 1340 : iterate_fix_dominators (CDI_DOMINATORS, doms, false);
3623 : :
3624 : : /* When we do not have a loop-around edge to the epilog we know
3625 : : the vector loop covered at least VF scalar iterations unless
3626 : : we have early breaks.
3627 : : Update any known upper bound with this knowledge. */
3628 : 30336 : if (! skip_vector
3629 : 8006 : && ! LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3630 : : {
3631 : 6894 : if (epilog->any_upper_bound)
3632 : 6894 : epilog->nb_iterations_upper_bound -= lowest_vf;
3633 : 6894 : if (epilog->any_likely_upper_bound)
3634 : 6894 : epilog->nb_iterations_likely_upper_bound -= lowest_vf;
3635 : 6894 : if (epilog->any_estimate)
3636 : 6891 : epilog->nb_iterations_estimate -= lowest_vf;
3637 : : }
3638 : :
3639 : 30336 : unsigned HOST_WIDE_INT bound;
3640 : 30336 : if (bound_scalar.is_constant (&bound))
3641 : : {
3642 : 30336 : gcc_assert (bound != 0);
3643 : : /* Adjust the upper bound by the extra peeled vector iteration if we
3644 : : are an epilogue of an peeled vect loop and not VLA. For VLA the
3645 : : loop bounds are unknown. */
3646 : 30336 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3647 : 30336 : && vf.is_constant ())
3648 : 19 : bound += vf.to_constant ();
3649 : : /* -1 to convert loop iterations to latch iterations. */
3650 : 30336 : record_niter_bound (epilog, bound - 1, false, true);
3651 : 30336 : scale_loop_profile (epilog, profile_probability::always (),
3652 : : bound - 1);
3653 : : }
3654 : :
3655 : 30336 : delete_update_ssa ();
3656 : 30336 : adjust_vec_debug_stmts ();
3657 : 30336 : scev_reset ();
3658 : 30336 : }
3659 : :
3660 : 30367 : if (vect_epilogues)
3661 : : {
3662 : 6373 : epilog->aux = epilogue_vinfo;
3663 : 6373 : LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3664 : 6373 : LOOP_VINFO_IV_EXIT (epilogue_vinfo)
3665 : 6373 : = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3666 : :
3667 : 6373 : loop_constraint_clear (epilog, LOOP_C_INFINITE);
3668 : :
3669 : : /* We now must calculate the number of NITERS performed by the previous
3670 : : loop and EPILOGUE_NITERS to be performed by the epilogue. */
3671 : 6373 : tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3672 : : niters_prolog, niters_vector_mult_vf);
3673 : :
3674 : : /* If skip_vector we may skip the previous loop, we insert a phi-node to
3675 : : determine whether we are coming from the previous vectorized loop
3676 : : using the update_e edge or the skip_vector basic block using the
3677 : : skip_e edge. */
3678 : 6373 : if (skip_vector)
3679 : : {
3680 : 5218 : gcc_assert (update_e != NULL && skip_e != NULL);
3681 : 5218 : gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3682 : : update_e->dest);
3683 : 5218 : tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3684 : 5218 : gimple *stmt = gimple_build_assign (new_ssa, niters);
3685 : 5218 : gimple_stmt_iterator gsi;
3686 : 5218 : if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3687 : 5218 : && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3688 : : {
3689 : 5218 : gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3690 : 5218 : gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3691 : : }
3692 : : else
3693 : : {
3694 : 0 : gsi = gsi_last_bb (update_e->src);
3695 : 0 : gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3696 : : }
3697 : :
3698 : 5218 : niters = new_ssa;
3699 : 5218 : add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3700 : 5218 : add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3701 : : UNKNOWN_LOCATION);
3702 : 5218 : niters = PHI_RESULT (new_phi);
3703 : 5218 : epilogue_vinfo->main_loop_edge = update_e;
3704 : 5218 : epilogue_vinfo->skip_main_loop_edge = skip_e;
3705 : : }
3706 : :
3707 : : /* Set ADVANCE to the number of iterations performed by the previous
3708 : : loop and its prologue. */
3709 : 6373 : *advance = niters;
3710 : :
3711 : : /* Subtract the number of iterations performed by the vectorized loop
3712 : : from the number of total iterations. */
3713 : 6373 : tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3714 : : before_loop_niters,
3715 : : niters);
3716 : :
3717 : 6373 : LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3718 : 6373 : LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3719 : 6373 : = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3720 : : epilogue_niters,
3721 : : build_one_cst (TREE_TYPE (epilogue_niters)));
3722 : :
3723 : : /* Decide what to do if the number of epilogue iterations is not
3724 : : a multiple of the epilogue loop's vectorization factor.
3725 : : We should have rejected the loop during the analysis phase
3726 : : if this fails. */
3727 : 6373 : bool res = vect_determine_partial_vectors_and_peeling (epilogue_vinfo);
3728 : 6373 : gcc_assert (res);
3729 : : }
3730 : :
3731 : 30367 : adjust_vec.release ();
3732 : 30367 : free_original_copy_tables ();
3733 : :
3734 : 54361 : return vect_epilogues ? epilog : NULL;
3735 : 30367 : }
3736 : :
3737 : : /* Function vect_create_cond_for_niters_checks.
3738 : :
3739 : : Create a conditional expression that represents the run-time checks for
3740 : : loop's niter. The loop is guaranteed to terminate if the run-time
3741 : : checks hold.
3742 : :
3743 : : Input:
3744 : : COND_EXPR - input conditional expression. New conditions will be chained
3745 : : with logical AND operation. If it is NULL, then the function
3746 : : is used to return the number of alias checks.
3747 : : LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3748 : : to be checked.
3749 : :
3750 : : Output:
3751 : : COND_EXPR - conditional expression.
3752 : :
3753 : : The returned COND_EXPR is the conditional expression to be used in the
3754 : : if statement that controls which version of the loop gets executed at
3755 : : runtime. */
3756 : :
3757 : : static void
3758 : 362 : vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3759 : : {
3760 : 362 : tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3761 : :
3762 : 362 : if (*cond_expr)
3763 : 362 : *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3764 : : *cond_expr, part_cond_expr);
3765 : : else
3766 : 0 : *cond_expr = part_cond_expr;
3767 : 362 : }
3768 : :
3769 : : /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3770 : : and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3771 : :
3772 : : static void
3773 : 307 : chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3774 : : {
3775 : 307 : if (*cond_expr)
3776 : 307 : *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3777 : : *cond_expr, part_cond_expr);
3778 : : else
3779 : 0 : *cond_expr = part_cond_expr;
3780 : 307 : }
3781 : :
3782 : : /* Function vect_create_cond_for_align_checks.
3783 : :
3784 : : Create a conditional expression that represents the alignment checks for
3785 : : all of data references (array element references) whose alignment must be
3786 : : checked at runtime.
3787 : :
3788 : : Input:
3789 : : COND_EXPR - input conditional expression. New conditions will be chained
3790 : : with logical AND operation.
3791 : : LOOP_VINFO - two fields of the loop information are used.
3792 : : LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3793 : : LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3794 : :
3795 : : Output:
3796 : : COND_EXPR_STMT_LIST - statements needed to construct the conditional
3797 : : expression.
3798 : : The returned value is the conditional expression to be used in the if
3799 : : statement that controls which version of the loop gets executed at runtime.
3800 : :
3801 : : The algorithm makes two assumptions:
3802 : : 1) The number of bytes "n" in a vector is a power of 2.
3803 : : 2) An address "a" is aligned if a%n is zero and that this
3804 : : test can be done as a&(n-1) == 0. For example, for 16
3805 : : byte vectors the test is a&0xf == 0. */
3806 : :
3807 : : static void
3808 : 27 : vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3809 : : tree *cond_expr,
3810 : : gimple_seq *cond_expr_stmt_list)
3811 : : {
3812 : 27 : const vec<stmt_vec_info> &may_misalign_stmts
3813 : : = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3814 : 27 : stmt_vec_info stmt_info;
3815 : 27 : int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3816 : 27 : tree mask_cst;
3817 : 27 : unsigned int i;
3818 : 27 : tree int_ptrsize_type;
3819 : 27 : char tmp_name[20];
3820 : 27 : tree or_tmp_name = NULL_TREE;
3821 : 27 : tree and_tmp_name;
3822 : 27 : gimple *and_stmt;
3823 : 27 : tree ptrsize_zero;
3824 : 27 : tree part_cond_expr;
3825 : :
3826 : : /* Check that mask is one less than a power of 2, i.e., mask is
3827 : : all zeros followed by all ones. */
3828 : 27 : gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3829 : :
3830 : 27 : int_ptrsize_type = signed_type_for (ptr_type_node);
3831 : :
3832 : : /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3833 : : of the first vector of the i'th data reference. */
3834 : :
3835 : 72 : FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3836 : : {
3837 : 45 : gimple_seq new_stmt_list = NULL;
3838 : 45 : tree addr_base;
3839 : 45 : tree addr_tmp_name;
3840 : 45 : tree new_or_tmp_name;
3841 : 45 : gimple *addr_stmt, *or_stmt;
3842 : 45 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3843 : 45 : bool negative = tree_int_cst_compare
3844 : 45 : (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3845 : 45 : tree offset = negative
3846 : 45 : ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3847 : : * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3848 : 45 : : size_zero_node;
3849 : :
3850 : : /* create: addr_tmp = (int)(address_of_first_vector) */
3851 : 45 : addr_base =
3852 : 45 : vect_create_addr_base_for_vector_ref (loop_vinfo,
3853 : : stmt_info, &new_stmt_list,
3854 : : offset);
3855 : 45 : if (new_stmt_list != NULL)
3856 : 16 : gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3857 : :
3858 : 45 : sprintf (tmp_name, "addr2int%d", i);
3859 : 45 : addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3860 : 45 : addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3861 : 45 : gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3862 : :
3863 : : /* The addresses are OR together. */
3864 : :
3865 : 45 : if (or_tmp_name != NULL_TREE)
3866 : : {
3867 : : /* create: or_tmp = or_tmp | addr_tmp */
3868 : 18 : sprintf (tmp_name, "orptrs%d", i);
3869 : 18 : new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3870 : 18 : or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3871 : : or_tmp_name, addr_tmp_name);
3872 : 18 : gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3873 : 18 : or_tmp_name = new_or_tmp_name;
3874 : : }
3875 : : else
3876 : : or_tmp_name = addr_tmp_name;
3877 : :
3878 : : } /* end for i */
3879 : :
3880 : 27 : mask_cst = build_int_cst (int_ptrsize_type, mask);
3881 : :
3882 : : /* create: and_tmp = or_tmp & mask */
3883 : 27 : and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3884 : :
3885 : 27 : and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3886 : : or_tmp_name, mask_cst);
3887 : 27 : gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3888 : :
3889 : : /* Make and_tmp the left operand of the conditional test against zero.
3890 : : if and_tmp has a nonzero bit then some address is unaligned. */
3891 : 27 : ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3892 : 27 : part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3893 : : and_tmp_name, ptrsize_zero);
3894 : 27 : chain_cond_expr (cond_expr, part_cond_expr);
3895 : 27 : }
3896 : :
3897 : : /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3898 : : create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3899 : : Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3900 : : and this new condition are true. Treat a null *COND_EXPR as "true". */
3901 : :
3902 : : static void
3903 : 3093 : vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3904 : : {
3905 : 3093 : const vec<vec_object_pair> &pairs
3906 : : = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3907 : 3093 : unsigned int i;
3908 : 3093 : vec_object_pair *pair;
3909 : 3101 : FOR_EACH_VEC_ELT (pairs, i, pair)
3910 : : {
3911 : 8 : tree addr1 = build_fold_addr_expr (pair->first);
3912 : 8 : tree addr2 = build_fold_addr_expr (pair->second);
3913 : 8 : tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3914 : : addr1, addr2);
3915 : 8 : chain_cond_expr (cond_expr, part_cond_expr);
3916 : : }
3917 : 3093 : }
3918 : :
3919 : : /* Create an expression that is true when all lower-bound conditions for
3920 : : the vectorized loop are met. Chain this condition with *COND_EXPR. */
3921 : :
3922 : : static void
3923 : 3093 : vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3924 : : {
3925 : 3093 : const vec<vec_lower_bound> &lower_bounds
3926 : : = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3927 : 3365 : for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3928 : : {
3929 : 272 : tree expr = lower_bounds[i].expr;
3930 : 272 : tree type = unsigned_type_for (TREE_TYPE (expr));
3931 : 272 : expr = fold_convert (type, expr);
3932 : 272 : poly_uint64 bound = lower_bounds[i].min_value;
3933 : 272 : if (!lower_bounds[i].unsigned_p)
3934 : : {
3935 : 106 : expr = fold_build2 (PLUS_EXPR, type, expr,
3936 : : build_int_cstu (type, bound - 1));
3937 : 106 : bound += bound - 1;
3938 : : }
3939 : 272 : tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3940 : : build_int_cstu (type, bound));
3941 : 272 : chain_cond_expr (cond_expr, part_cond_expr);
3942 : : }
3943 : 3093 : }
3944 : :
3945 : : /* Function vect_create_cond_for_alias_checks.
3946 : :
3947 : : Create a conditional expression that represents the run-time checks for
3948 : : overlapping of address ranges represented by a list of data references
3949 : : relations passed as input.
3950 : :
3951 : : Input:
3952 : : COND_EXPR - input conditional expression. New conditions will be chained
3953 : : with logical AND operation. If it is NULL, then the function
3954 : : is used to return the number of alias checks.
3955 : : LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3956 : : to be checked.
3957 : :
3958 : : Output:
3959 : : COND_EXPR - conditional expression.
3960 : :
3961 : : The returned COND_EXPR is the conditional expression to be used in the if
3962 : : statement that controls which version of the loop gets executed at runtime.
3963 : : */
3964 : :
3965 : : void
3966 : 3093 : vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3967 : : {
3968 : 3093 : const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
3969 : : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3970 : :
3971 : 3093 : if (comp_alias_ddrs.is_empty ())
3972 : : return;
3973 : :
3974 : 2952 : create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3975 : : &comp_alias_ddrs, cond_expr);
3976 : 2952 : if (dump_enabled_p ())
3977 : 1214 : dump_printf_loc (MSG_NOTE, vect_location,
3978 : : "created %u versioning for alias checks.\n",
3979 : : comp_alias_ddrs.length ());
3980 : : }
3981 : :
3982 : :
3983 : : /* Function vect_loop_versioning.
3984 : :
3985 : : If the loop has data references that may or may not be aligned or/and
3986 : : has data reference relations whose independence was not proven then
3987 : : two versions of the loop need to be generated, one which is vectorized
3988 : : and one which isn't. A test is then generated to control which of the
3989 : : loops is executed. The test checks for the alignment of all of the
3990 : : data references that may or may not be aligned. An additional
3991 : : sequence of runtime tests is generated for each pairs of DDRs whose
3992 : : independence was not proven. The vectorized version of loop is
3993 : : executed only if both alias and alignment tests are passed.
3994 : :
3995 : : The test generated to check which version of loop is executed
3996 : : is modified to also check for profitability as indicated by the
3997 : : cost model threshold TH.
3998 : :
3999 : : The versioning precondition(s) are placed in *COND_EXPR and
4000 : : *COND_EXPR_STMT_LIST. */
4001 : :
4002 : : class loop *
4003 : 3515 : vect_loop_versioning (loop_vec_info loop_vinfo,
4004 : : gimple *loop_vectorized_call)
4005 : : {
4006 : 3515 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
4007 : 3515 : class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
4008 : 3515 : basic_block condition_bb;
4009 : 3515 : gphi_iterator gsi;
4010 : 3515 : gimple_stmt_iterator cond_exp_gsi;
4011 : 3515 : basic_block merge_bb;
4012 : 3515 : basic_block new_exit_bb;
4013 : 3515 : edge new_exit_e, e;
4014 : 3515 : gphi *orig_phi, *new_phi;
4015 : 3515 : tree cond_expr = NULL_TREE;
4016 : 3515 : gimple_seq cond_expr_stmt_list = NULL;
4017 : 3515 : tree arg;
4018 : 3515 : profile_probability prob = profile_probability::likely ();
4019 : 3515 : gimple_seq gimplify_stmt_list = NULL;
4020 : 3515 : tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
4021 : 3515 : bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
4022 : 3515 : bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
4023 : 3515 : bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
4024 : 3515 : poly_uint64 versioning_threshold
4025 : : = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
4026 : 3515 : tree version_simd_if_cond
4027 : : = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
4028 : 3515 : unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
4029 : :
4030 : 3515 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
4031 : : && !ordered_p (th, versioning_threshold))
4032 : : cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
4033 : : build_int_cst (TREE_TYPE (scalar_loop_iters),
4034 : : th - 1));
4035 : 3515 : if (maybe_ne (versioning_threshold, 0U))
4036 : : {
4037 : 3512 : tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
4038 : : build_int_cst (TREE_TYPE (scalar_loop_iters),
4039 : : versioning_threshold - 1));
4040 : 3512 : if (cond_expr)
4041 : 0 : cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
4042 : : expr, cond_expr);
4043 : : else
4044 : 3512 : cond_expr = expr;
4045 : : }
4046 : :
4047 : 3515 : tree cost_name = NULL_TREE;
4048 : 3515 : profile_probability prob2 = profile_probability::always ();
4049 : 3515 : if (cond_expr
4050 : 3512 : && EXPR_P (cond_expr)
4051 : 2414 : && (version_niter
4052 : 2414 : || version_align
4053 : : || version_alias
4054 : 2097 : || version_simd_if_cond))
4055 : : {
4056 : 2414 : cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4057 : : &cond_expr_stmt_list,
4058 : : is_gimple_val, NULL_TREE);
4059 : : /* Split prob () into two so that the overall probability of passing
4060 : : both the cost-model and versioning checks is the orig prob. */
4061 : 2414 : prob2 = prob = prob.sqrt ();
4062 : : }
4063 : :
4064 : 3515 : if (version_niter)
4065 : 362 : vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
4066 : :
4067 : 3515 : if (cond_expr)
4068 : : {
4069 : 3512 : gimple_seq tem = NULL;
4070 : 3512 : cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4071 : : &tem, is_gimple_condexpr_for_cond,
4072 : : NULL_TREE);
4073 : 3512 : gimple_seq_add_seq (&cond_expr_stmt_list, tem);
4074 : : }
4075 : :
4076 : 3515 : if (version_align)
4077 : 27 : vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
4078 : : &cond_expr_stmt_list);
4079 : :
4080 : 3515 : if (version_alias)
4081 : : {
4082 : 3093 : vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
4083 : 3093 : vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
4084 : 3093 : vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
4085 : : }
4086 : :
4087 : 3515 : if (version_simd_if_cond)
4088 : : {
4089 : 58 : gcc_assert (dom_info_available_p (CDI_DOMINATORS));
4090 : 58 : if (flag_checking)
4091 : 58 : if (basic_block bb
4092 : 58 : = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
4093 : 58 : gcc_assert (bb != loop->header
4094 : : && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
4095 : : && (scalar_loop == NULL
4096 : : || (bb != scalar_loop->header
4097 : : && dominated_by_p (CDI_DOMINATORS,
4098 : : scalar_loop->header, bb))));
4099 : 58 : tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
4100 : 58 : tree c = fold_build2 (NE_EXPR, boolean_type_node,
4101 : : version_simd_if_cond, zero);
4102 : 58 : if (cond_expr)
4103 : 58 : cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
4104 : : c, cond_expr);
4105 : : else
4106 : 0 : cond_expr = c;
4107 : 58 : if (dump_enabled_p ())
4108 : 5 : dump_printf_loc (MSG_NOTE, vect_location,
4109 : : "created versioning for simd if condition check.\n");
4110 : : }
4111 : :
4112 : 3515 : cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4113 : : &gimplify_stmt_list,
4114 : : is_gimple_condexpr_for_cond, NULL_TREE);
4115 : 3515 : gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
4116 : :
4117 : : /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
4118 : : invariant in. */
4119 : 3515 : class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
4120 : 3515 : for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
4121 : 62273 : !gsi_end_p (gsi); gsi_next (&gsi))
4122 : : {
4123 : 58758 : gimple *stmt = gsi_stmt (gsi);
4124 : 58758 : update_stmt (stmt);
4125 : 58758 : ssa_op_iter iter;
4126 : 58758 : use_operand_p use_p;
4127 : 58758 : basic_block def_bb;
4128 : 135820 : FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
4129 : 77062 : if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
4130 : 77062 : && flow_bb_inside_loop_p (outermost, def_bb))
4131 : 2361 : outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
4132 : : }
4133 : :
4134 : : /* Search for the outermost loop we can version. Avoid versioning of
4135 : : non-perfect nests but allow if-conversion versioned loops inside. */
4136 : 3515 : class loop *loop_to_version = loop;
4137 : 3515 : if (flow_loop_nested_p (outermost, loop))
4138 : : {
4139 : 1366 : if (dump_enabled_p ())
4140 : 622 : dump_printf_loc (MSG_NOTE, vect_location,
4141 : : "trying to apply versioning to outer loop %d\n",
4142 : : outermost->num);
4143 : 1366 : if (outermost->num == 0)
4144 : 1282 : outermost = superloop_at_depth (loop, 1);
4145 : : /* And avoid applying versioning on non-perfect nests. */
4146 : : while (loop_to_version != outermost
4147 : 109 : && (e = single_exit (loop_outer (loop_to_version)))
4148 : 90 : && !(e->flags & EDGE_COMPLEX)
4149 : 89 : && (!loop_outer (loop_to_version)->inner->next
4150 : 45 : || vect_loop_vectorized_call (loop_to_version))
4151 : 1460 : && (!loop_outer (loop_to_version)->inner->next
4152 : 3 : || !loop_outer (loop_to_version)->inner->next->next))
4153 : : loop_to_version = loop_outer (loop_to_version);
4154 : : }
4155 : :
4156 : : /* Apply versioning. If there is already a scalar version created by
4157 : : if-conversion re-use that. Note we cannot re-use the copy of
4158 : : an if-converted outer-loop when vectorizing the inner loop only. */
4159 : 3515 : gcond *cond;
4160 : 3515 : if ((!loop_to_version->inner || loop == loop_to_version)
4161 : 3480 : && loop_vectorized_call)
4162 : : {
4163 : 78 : gcc_assert (scalar_loop);
4164 : 78 : condition_bb = gimple_bb (loop_vectorized_call);
4165 : 156 : cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
4166 : 78 : gimple_cond_set_condition_from_tree (cond, cond_expr);
4167 : 78 : update_stmt (cond);
4168 : :
4169 : 78 : if (cond_expr_stmt_list)
4170 : : {
4171 : 78 : cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
4172 : 78 : gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4173 : : GSI_SAME_STMT);
4174 : : }
4175 : :
4176 : : /* if-conversion uses profile_probability::always () for both paths,
4177 : : reset the paths probabilities appropriately. */
4178 : 78 : edge te, fe;
4179 : 78 : extract_true_false_edges_from_block (condition_bb, &te, &fe);
4180 : 78 : te->probability = prob;
4181 : 78 : fe->probability = prob.invert ();
4182 : : /* We can scale loops counts immediately but have to postpone
4183 : : scaling the scalar loop because we re-use it during peeling.
4184 : :
4185 : : Ifcvt duplicates loop preheader, loop body and produces an basic
4186 : : block after loop exit. We need to scale all that. */
4187 : 78 : basic_block preheader = loop_preheader_edge (loop_to_version)->src;
4188 : 78 : preheader->count = preheader->count.apply_probability (prob * prob2);
4189 : 78 : scale_loop_frequencies (loop_to_version, prob * prob2);
4190 : : /* When the loop has multiple exits then we can only version itself.
4191 : : This is denoted by loop_to_version == loop. In this case we can
4192 : : do the versioning by selecting the exit edge the vectorizer is
4193 : : currently using. */
4194 : 78 : edge exit_edge;
4195 : 78 : if (loop_to_version == loop)
4196 : 78 : exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4197 : : else
4198 : 0 : exit_edge = single_exit (loop_to_version);
4199 : 78 : exit_edge->dest->count = preheader->count;
4200 : 78 : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert ();
4201 : :
4202 : 78 : nloop = scalar_loop;
4203 : 78 : if (dump_enabled_p ())
4204 : 82 : dump_printf_loc (MSG_NOTE, vect_location,
4205 : : "reusing %sloop version created by if conversion\n",
4206 : : loop_to_version != loop ? "outer " : "");
4207 : 78 : }
4208 : : else
4209 : : {
4210 : 3437 : if (loop_to_version != loop
4211 : 3437 : && dump_enabled_p ())
4212 : 11 : dump_printf_loc (MSG_NOTE, vect_location,
4213 : : "applying loop versioning to outer loop %d\n",
4214 : : loop_to_version->num);
4215 : :
4216 : 3437 : unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
4217 : :
4218 : 3437 : initialize_original_copy_tables ();
4219 : 6874 : nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
4220 : 3437 : prob * prob2, (prob * prob2).invert (),
4221 : 3437 : prob * prob2, (prob * prob2).invert (),
4222 : : true);
4223 : :
4224 : : /* If the PHI nodes in the loop header were reallocated, we need to fix up
4225 : : our internally stashed copies of those. */
4226 : 3437 : if (loop_to_version == loop)
4227 : 3402 : for (auto gsi = gsi_start_phis (loop->header);
4228 : 13551 : !gsi_end_p (gsi); gsi_next (&gsi))
4229 : 10149 : loop_vinfo->resync_stmt_addr (gsi.phi ());
4230 : :
4231 : : /* We will later insert second conditional so overall outcome of
4232 : : both is prob * prob2. */
4233 : 3437 : edge true_e, false_e;
4234 : 3437 : extract_true_false_edges_from_block (condition_bb, &true_e, &false_e);
4235 : 3437 : true_e->probability = prob;
4236 : 3437 : false_e->probability = prob.invert ();
4237 : 3437 : gcc_assert (nloop);
4238 : 3437 : nloop = get_loop_copy (loop);
4239 : :
4240 : : /* For cycle vectorization with SLP we rely on the PHI arguments
4241 : : appearing in the same order as the SLP node operands which for the
4242 : : loop PHI nodes means the preheader edge dest index needs to remain
4243 : : the same for the analyzed loop which also becomes the vectorized one.
4244 : : Make it so in case the state after versioning differs by redirecting
4245 : : the first edge into the header to the same destination which moves
4246 : : it last. */
4247 : 3437 : if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
4248 : : {
4249 : 289 : edge e = EDGE_PRED (loop->header, 0);
4250 : 289 : ssa_redirect_edge (e, e->dest);
4251 : 289 : flush_pending_stmts (e);
4252 : : }
4253 : 3437 : gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
4254 : :
4255 : : /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
4256 : : reap those otherwise; they also refer to the original
4257 : : loops. */
4258 : : class loop *l = loop;
4259 : 3440 : while (gimple *call = vect_loop_vectorized_call (l))
4260 : : {
4261 : 3 : call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
4262 : 3 : fold_loop_internal_call (call, boolean_false_node);
4263 : 3 : l = loop_outer (l);
4264 : 3 : }
4265 : 3437 : free_original_copy_tables ();
4266 : :
4267 : 3437 : if (cond_expr_stmt_list)
4268 : : {
4269 : 3357 : cond_exp_gsi = gsi_last_bb (condition_bb);
4270 : 3357 : gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4271 : : GSI_SAME_STMT);
4272 : : }
4273 : :
4274 : : /* Loop versioning violates an assumption we try to maintain during
4275 : : vectorization - that the loop exit block has a single predecessor.
4276 : : After versioning, the exit block of both loop versions is the same
4277 : : basic block (i.e. it has two predecessors). Just in order to simplify
4278 : : following transformations in the vectorizer, we fix this situation
4279 : : here by adding a new (empty) block on the exit-edge of the loop,
4280 : : with the proper loop-exit phis to maintain loop-closed-form.
4281 : : If loop versioning wasn't done from loop, but scalar_loop instead,
4282 : : merge_bb will have already just a single successor. */
4283 : :
4284 : : /* When the loop has multiple exits then we can only version itself.
4285 : : This is denoted by loop_to_version == loop. In this case we can
4286 : : do the versioning by selecting the exit edge the vectorizer is
4287 : : currently using. */
4288 : 3437 : edge exit_edge;
4289 : 3437 : if (loop_to_version == loop)
4290 : 3402 : exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4291 : : else
4292 : 35 : exit_edge = single_exit (loop_to_version);
4293 : :
4294 : 3437 : gcc_assert (exit_edge);
4295 : 3437 : merge_bb = exit_edge->dest;
4296 : 3437 : if (EDGE_COUNT (merge_bb->preds) >= 2)
4297 : : {
4298 : 3437 : gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
4299 : 3437 : new_exit_bb = split_edge (exit_edge);
4300 : 3437 : new_exit_e = exit_edge;
4301 : 3437 : e = EDGE_SUCC (new_exit_bb, 0);
4302 : :
4303 : 7106 : for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
4304 : 3669 : gsi_next (&gsi))
4305 : : {
4306 : 3669 : tree new_res;
4307 : 3669 : orig_phi = gsi.phi ();
4308 : 3669 : new_res = copy_ssa_name (PHI_RESULT (orig_phi));
4309 : 3669 : new_phi = create_phi_node (new_res, new_exit_bb);
4310 : 3669 : arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4311 : 3669 : add_phi_arg (new_phi, arg, new_exit_e,
4312 : : gimple_phi_arg_location_from_edge (orig_phi, e));
4313 : 3669 : adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
4314 : : }
4315 : : }
4316 : :
4317 : 3437 : update_ssa (TODO_update_ssa_no_phi);
4318 : : }
4319 : :
4320 : : /* Split the cost model check off to a separate BB. Costing assumes
4321 : : this is the only thing we perform when we enter the scalar loop
4322 : : from a failed cost decision. */
4323 : 3515 : if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
4324 : : {
4325 : 2414 : gimple *def = SSA_NAME_DEF_STMT (cost_name);
4326 : 2414 : gcc_assert (gimple_bb (def) == condition_bb);
4327 : : /* All uses of the cost check are 'true' after the check we
4328 : : are going to insert. */
4329 : 2414 : replace_uses_by (cost_name, boolean_true_node);
4330 : : /* And we're going to build the new single use of it. */
4331 : 2414 : gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
4332 : : NULL_TREE, NULL_TREE);
4333 : 2414 : edge e = split_block (gimple_bb (def), def);
4334 : 2414 : gimple_stmt_iterator gsi = gsi_for_stmt (def);
4335 : 2414 : gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
4336 : 2414 : edge true_e, false_e;
4337 : 2414 : extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
4338 : 2414 : e->flags &= ~EDGE_FALLTHRU;
4339 : 2414 : e->flags |= EDGE_TRUE_VALUE;
4340 : 2414 : edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
4341 : 2414 : e->probability = prob2;
4342 : 2414 : e2->probability = prob2.invert ();
4343 : 2414 : e->dest->count = e->count ();
4344 : 2414 : set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
4345 : 2414 : auto_vec<basic_block, 3> adj;
4346 : 2414 : for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
4347 : 7233 : son;
4348 : 4819 : son = next_dom_son (CDI_DOMINATORS, son))
4349 : 7224 : if (EDGE_COUNT (son->preds) > 1)
4350 : 2405 : adj.safe_push (son);
4351 : 9647 : for (auto son : adj)
4352 : 2405 : set_immediate_dominator (CDI_DOMINATORS, son, e->src);
4353 : : //debug_bb (condition_bb);
4354 : : //debug_bb (e->src);
4355 : 2414 : }
4356 : :
4357 : 3515 : if (version_niter)
4358 : : {
4359 : : /* The versioned loop could be infinite, we need to clear existing
4360 : : niter information which is copied from the original loop. */
4361 : 362 : gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
4362 : 362 : vect_free_loop_info_assumptions (nloop);
4363 : : }
4364 : :
4365 : 3515 : if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
4366 : 3515 : && dump_enabled_p ())
4367 : : {
4368 : 752 : if (version_alias)
4369 : 707 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4370 : : vect_location,
4371 : : "loop versioned for vectorization because of "
4372 : : "possible aliasing\n");
4373 : 752 : if (version_align)
4374 : 12 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4375 : : vect_location,
4376 : : "loop versioned for vectorization to enhance "
4377 : : "alignment\n");
4378 : :
4379 : : }
4380 : :
4381 : 3515 : return nloop;
4382 : : }
|