Branch data Line data Source code
1 : : /* Vectorizer Specific Loop Manipulations
2 : : Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : : and Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #include "config.h"
23 : : #include "system.h"
24 : : #include "coretypes.h"
25 : : #include "backend.h"
26 : : #include "tree.h"
27 : : #include "gimple.h"
28 : : #include "cfghooks.h"
29 : : #include "tree-pass.h"
30 : : #include "ssa.h"
31 : : #include "fold-const.h"
32 : : #include "cfganal.h"
33 : : #include "gimplify.h"
34 : : #include "gimple-iterator.h"
35 : : #include "gimplify-me.h"
36 : : #include "tree-cfg.h"
37 : : #include "tree-ssa-loop-manip.h"
38 : : #include "tree-into-ssa.h"
39 : : #include "tree-ssa.h"
40 : : #include "cfgloop.h"
41 : : #include "tree-scalar-evolution.h"
42 : : #include "tree-vectorizer.h"
43 : : #include "tree-ssa-loop-ivopts.h"
44 : : #include "gimple-fold.h"
45 : : #include "tree-ssa-loop-niter.h"
46 : : #include "internal-fn.h"
47 : : #include "stor-layout.h"
48 : : #include "optabs-query.h"
49 : : #include "vec-perm-indices.h"
50 : : #include "insn-config.h"
51 : : #include "rtl.h"
52 : : #include "recog.h"
53 : : #include "langhooks.h"
54 : : #include "tree-vector-builder.h"
55 : : #include "optabs-tree.h"
56 : :
57 : : /*************************************************************************
58 : : Simple Loop Peeling Utilities
59 : :
60 : : Utilities to support loop peeling for vectorization purposes.
61 : : *************************************************************************/
62 : :
63 : :
64 : : /* Renames the use *OP_P. */
65 : :
66 : : static void
67 : 901818 : rename_use_op (use_operand_p op_p)
68 : : {
69 : 901818 : tree new_name;
70 : :
71 : 901818 : if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
72 : : return;
73 : :
74 : 898374 : new_name = get_current_def (USE_FROM_PTR (op_p));
75 : :
76 : : /* Something defined outside of the loop. */
77 : 898374 : if (!new_name)
78 : : return;
79 : :
80 : : /* An ordinary ssa name defined in the loop. */
81 : :
82 : 774883 : SET_USE (op_p, new_name);
83 : : }
84 : :
85 : :
86 : : /* Renames the variables in basic block BB. Allow renaming of PHI arguments
87 : : on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
88 : : true. */
89 : :
90 : : static void
91 : 111162 : rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
92 : : {
93 : 111162 : gimple *stmt;
94 : 111162 : use_operand_p use_p;
95 : 111162 : ssa_op_iter iter;
96 : 111162 : edge e;
97 : 111162 : edge_iterator ei;
98 : 111162 : class loop *loop = bb->loop_father;
99 : 111162 : class loop *outer_loop = NULL;
100 : :
101 : 111162 : if (rename_from_outer_loop)
102 : : {
103 : 829 : gcc_assert (loop);
104 : 829 : outer_loop = loop_outer (loop);
105 : : }
106 : :
107 : 735930 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
108 : 513606 : gsi_next (&gsi))
109 : : {
110 : 513606 : stmt = gsi_stmt (gsi);
111 : 1264117 : FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
112 : 750511 : rename_use_op (use_p);
113 : : }
114 : :
115 : 227120 : FOR_EACH_EDGE (e, ei, bb->preds)
116 : : {
117 : 115958 : if (!flow_bb_inside_loop_p (loop, e->src))
118 : : {
119 : 34623 : if (!rename_from_outer_loop)
120 : 34363 : continue;
121 : 260 : if (e->src != outer_loop->header)
122 : : {
123 : 152 : if (outer_loop->inner->next)
124 : : {
125 : : /* If outer_loop has 2 inner loops, allow there to
126 : : be an extra basic block which decides which of the
127 : : two loops to use using LOOP_VECTORIZED. */
128 : 149 : if (!single_pred_p (e->src)
129 : 26 : || single_pred (e->src) != outer_loop->header)
130 : 123 : continue;
131 : : }
132 : : }
133 : : }
134 : 184791 : for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
135 : 103319 : gsi_next (&gsi))
136 : 103319 : rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
137 : : }
138 : 111162 : }
139 : :
140 : :
141 : : struct adjust_info
142 : : {
143 : : tree from, to;
144 : : basic_block bb;
145 : : };
146 : :
147 : : /* A stack of values to be adjusted in debug stmts. We have to
148 : : process them LIFO, so that the closest substitution applies. If we
149 : : processed them FIFO, without the stack, we might substitute uses
150 : : with a PHI DEF that would soon become non-dominant, and when we got
151 : : to the suitable one, it wouldn't have anything to substitute any
152 : : more. */
153 : : static vec<adjust_info, va_heap> adjust_vec;
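: : /* For example (with illustrative SSA names): if peeling first
: : records x_1 -> x_2 for a PHI in an early block and later records
: : x_1 -> x_3 for a closer PHI, popping the stack applies x_1 -> x_3
: : to the debug uses it dominates first; FIFO processing could
: : instead rewrite those uses to x_2, whose definition may no longer
: : dominate them, leaving nothing for the later entry to fix up. */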
154 : :
155 : : /* Adjust any debug stmts that referenced AI->from values to use the
156 : : loop-closed AI->to, if the references are dominated by AI->bb and
157 : : not by the definition of AI->from. */
158 : :
159 : : static void
160 : 62964 : adjust_debug_stmts_now (adjust_info *ai)
161 : : {
162 : 62964 : basic_block bbphi = ai->bb;
163 : 62964 : tree orig_def = ai->from;
164 : 62964 : tree new_def = ai->to;
165 : 62964 : imm_use_iterator imm_iter;
166 : 62964 : gimple *stmt;
167 : 62964 : basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
168 : :
169 : 62964 : gcc_assert (dom_info_available_p (CDI_DOMINATORS));
170 : :
171 : : /* Adjust any debug stmts that held onto non-loop-closed
172 : : references. */
173 : 361130 : FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
174 : : {
175 : 235202 : use_operand_p use_p;
176 : 235202 : basic_block bbuse;
177 : :
178 : 235202 : if (!is_gimple_debug (stmt))
179 : 178155 : continue;
180 : :
181 : 57047 : gcc_assert (gimple_debug_bind_p (stmt));
182 : :
183 : 57047 : bbuse = gimple_bb (stmt);
184 : :
185 : 57047 : if ((bbuse == bbphi
186 : 57047 : || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
187 : 58297 : && !(bbuse == bbdef
188 : 625 : || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
189 : : {
190 : 0 : if (new_def)
191 : 0 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
192 : 0 : SET_USE (use_p, new_def);
193 : : else
194 : : {
195 : 0 : gimple_debug_bind_reset_value (stmt);
196 : 0 : update_stmt (stmt);
197 : : }
198 : : }
199 : 62964 : }
200 : 62964 : }
201 : :
202 : : /* Adjust debug stmts as scheduled before. */
203 : :
204 : : static void
205 : 33415 : adjust_vec_debug_stmts (void)
206 : : {
207 : 33415 : if (!MAY_HAVE_DEBUG_BIND_STMTS)
208 : : return;
209 : :
210 : 12408 : gcc_assert (adjust_vec.exists ());
211 : :
212 : 75261 : while (!adjust_vec.is_empty ())
213 : : {
214 : 62853 : adjust_debug_stmts_now (&adjust_vec.last ());
215 : 62853 : adjust_vec.pop ();
216 : : }
217 : : }
218 : :
219 : : /* Adjust any debug stmts that referenced FROM values to use the
220 : : loop-closed TO, if the references are dominated by BB and not by
221 : : the definition of FROM. If adjust_vec is non-NULL, adjustments
222 : : will be postponed until adjust_vec_debug_stmts is called. */
223 : :
224 : : static void
225 : 90736 : adjust_debug_stmts (tree from, tree to, basic_block bb)
226 : : {
227 : 90736 : adjust_info ai;
228 : :
229 : 90736 : if (MAY_HAVE_DEBUG_BIND_STMTS
230 : 90736 : && TREE_CODE (from) == SSA_NAME
231 : 81366 : && ! SSA_NAME_IS_DEFAULT_DEF (from)
232 : 170765 : && ! virtual_operand_p (from))
233 : : {
234 : 62964 : ai.from = from;
235 : 62964 : ai.to = to;
236 : 62964 : ai.bb = bb;
237 : :
238 : 62964 : if (adjust_vec.exists ())
239 : 62853 : adjust_vec.safe_push (ai);
240 : : else
241 : 111 : adjust_debug_stmts_now (&ai);
242 : : }
243 : 90736 : }
244 : :
245 : : /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
246 : : to adjust any debug stmts that referenced the old phi arg,
247 : : presumably non-loop-closed references left over from other
248 : : transformations. */
249 : :
250 : : static void
251 : 163390 : adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
252 : : {
253 : 163390 : tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
254 : :
255 : 163390 : gcc_assert (TREE_CODE (orig_def) != SSA_NAME
256 : : || orig_def != new_def);
257 : :
258 : 163390 : SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
259 : :
260 : 163390 : if (MAY_HAVE_DEBUG_BIND_STMTS)
261 : 67670 : adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
262 : : gimple_bb (update_phi));
263 : 163390 : }
264 : :
265 : : /* Define one loop rgroup control CTRL from loop LOOP. INIT_CTRL is the value
266 : : that the control should have during the first iteration and NEXT_CTRL is the
267 : : value that it should have on subsequent iterations. */
268 : :
269 : : static void
270 : 86 : vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
271 : : tree next_ctrl)
272 : : {
273 : 86 : gphi *phi = create_phi_node (ctrl, loop->header);
274 : 86 : add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
275 : 86 : add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
276 : 86 : }
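: : /* The result is a header PHI of the form
: :
: : ctrl = PHI <init_ctrl (preheader), next_ctrl (latch)>
: :
: : so the control starts out as INIT_CTRL and is replaced by
: : NEXT_CTRL on each subsequent iteration. */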
277 : :
278 : : /* Add SEQ to the end of LOOP's preheader block. */
279 : :
280 : : static void
281 : 18 : add_preheader_seq (class loop *loop, gimple_seq seq)
282 : : {
283 : 18 : if (seq)
284 : : {
285 : 15 : edge pe = loop_preheader_edge (loop);
286 : 15 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
287 : 15 : gcc_assert (!new_bb);
288 : : }
289 : 18 : }
290 : :
291 : : /* Add SEQ to the beginning of LOOP's header block. */
292 : :
293 : : static void
294 : 0 : add_header_seq (class loop *loop, gimple_seq seq)
295 : : {
296 : 0 : if (seq)
297 : : {
298 : 0 : gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
299 : 0 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
300 : : }
301 : 0 : }
302 : :
303 : : /* Return true if the target can interleave elements of two vectors.
304 : : OFFSET is 0 if the first half of the vectors should be interleaved
305 : : or 1 if the second half should. When returning true, store the
306 : : associated permutation in INDICES. */
307 : :
308 : : static bool
309 : 0 : interleave_supported_p (vec_perm_indices *indices, tree vectype,
310 : : unsigned int offset)
311 : : {
312 : 0 : poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
313 : 0 : poly_uint64 base = exact_div (nelts, 2) * offset;
314 : 0 : vec_perm_builder sel (nelts, 2, 3);
315 : 0 : for (unsigned int i = 0; i < 3; ++i)
316 : : {
317 : 0 : sel.quick_push (base + i);
318 : 0 : sel.quick_push (base + i + nelts);
319 : : }
320 : 0 : indices->new_vector (sel, 2, nelts);
321 : 0 : return can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
322 : 0 : *indices);
323 : 0 : }
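: : /* For example, for a 4-element vector type, OFFSET 0 yields the
: : low-half interleave selector { 0, 4, 1, 5 } and OFFSET 1 the
: : high-half selector { 2, 6, 3, 7 }; the builder above encodes just
: : the first three index pairs and vec_perm_indices extrapolates the
: : rest, so the same test also works for variable-length vectors. */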
324 : :
325 : : /* Try to use permutes to define the masks in DEST_RGM using the masks
326 : : in SRC_RGM, given that the former has twice as many masks as the
327 : : latter. Return true on success, adding any new statements to SEQ. */
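: : /* For example, on a little-endian target, two destination masks
: : can be produced from one source mask either by unpacking
: :
: : dest[0] = VEC_UNPACK_LO_EXPR <src[0]>;
: : dest[1] = VEC_UNPACK_HI_EXPR <src[0]>;
: :
: : or, when the mask types match, by interleaving the source mask
: : with itself so that every mask bit is duplicated:
: :
: : dest[i] = VEC_PERM_EXPR <src[0], src[0], interleave_lo/hi>; */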
328 : :
329 : : static bool
330 : 0 : vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
331 : : rgroup_controls *src_rgm)
332 : : {
333 : 0 : tree src_masktype = src_rgm->type;
334 : 0 : tree dest_masktype = dest_rgm->type;
335 : 0 : machine_mode src_mode = TYPE_MODE (src_masktype);
336 : 0 : insn_code icode1, icode2;
337 : 0 : if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
338 : 0 : && (icode1 = optab_handler (vec_unpacku_hi_optab,
339 : : src_mode)) != CODE_FOR_nothing
340 : 0 : && (icode2 = optab_handler (vec_unpacku_lo_optab,
341 : : src_mode)) != CODE_FOR_nothing)
342 : : {
343 : : /* Unpacking the source masks gives at least as many mask bits as
344 : : we need. We can then VIEW_CONVERT any excess bits away. */
345 : 0 : machine_mode dest_mode = insn_data[icode1].operand[0].mode;
346 : 0 : gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
347 : 0 : tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
348 : 0 : for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
349 : : {
350 : 0 : tree src = src_rgm->controls[i / 2];
351 : 0 : tree dest = dest_rgm->controls[i];
352 : 0 : tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
353 : 0 : ? VEC_UNPACK_HI_EXPR
354 : : : VEC_UNPACK_LO_EXPR);
355 : 0 : gassign *stmt;
356 : 0 : if (dest_masktype == unpack_masktype)
357 : 0 : stmt = gimple_build_assign (dest, code, src);
358 : : else
359 : : {
360 : 0 : tree temp = make_ssa_name (unpack_masktype);
361 : 0 : stmt = gimple_build_assign (temp, code, src);
362 : 0 : gimple_seq_add_stmt (seq, stmt);
363 : 0 : stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
364 : : build1 (VIEW_CONVERT_EXPR,
365 : : dest_masktype, temp));
366 : : }
367 : 0 : gimple_seq_add_stmt (seq, stmt);
368 : : }
369 : : return true;
370 : : }
371 : 0 : vec_perm_indices indices[2];
372 : 0 : if (dest_masktype == src_masktype
373 : 0 : && interleave_supported_p (&indices[0], src_masktype, 0)
374 : 0 : && interleave_supported_p (&indices[1], src_masktype, 1))
375 : : {
376 : : /* The destination requires twice as many mask bits as the source, so
377 : : we can use interleaving permutes to double up the number of bits. */
378 : : tree masks[2];
379 : 0 : for (unsigned int i = 0; i < 2; ++i)
380 : 0 : masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
381 : 0 : for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
382 : : {
383 : 0 : tree src = src_rgm->controls[i / 2];
384 : 0 : tree dest = dest_rgm->controls[i];
385 : 0 : gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
386 : 0 : src, src, masks[i & 1]);
387 : 0 : gimple_seq_add_stmt (seq, stmt);
388 : : }
389 : 0 : return true;
390 : : }
391 : : return false;
392 : 0 : }
393 : :
394 : : /* Populate DEST_RGM->controls, given that they should add up to STEP.
395 : :
396 : : STEP = MIN_EXPR <ivtmp_34, VF>;
397 : :
398 : : First length (MIN (X, VF/N)):
399 : : loop_len_15 = MIN_EXPR <STEP, VF/N>;
400 : :
401 : : Second length:
402 : : tmp = STEP - loop_len_15;
403 : : loop_len_16 = MIN (tmp, VF/N);
404 : :
405 : : Third length:
406 : : tmp2 = tmp - loop_len_16;
407 : : loop_len_17 = MIN (tmp2, VF/N);
408 : :
409 : : Last length:
410 : : loop_len_18 = tmp2 - loop_len_17;
411 : : */
412 : :
413 : : static void
414 : 0 : vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
415 : : rgroup_controls *dest_rgm, tree step)
416 : : {
417 : 0 : tree ctrl_type = dest_rgm->type;
418 : 0 : poly_uint64 nitems_per_ctrl
419 : 0 : = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
420 : 0 : tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
421 : :
422 : 0 : for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
423 : : {
424 : 0 : tree ctrl = dest_rgm->controls[i];
425 : 0 : if (i == 0)
426 : : {
427 : : /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N]. */
428 : 0 : gassign *assign
429 : 0 : = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
430 : 0 : gimple_seq_add_stmt (seq, assign);
431 : : }
432 : 0 : else if (i == dest_rgm->controls.length () - 1)
433 : : {
434 : : /* Last iteration: Remain capped to the range [0, VF/N]. */
435 : 0 : gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
436 : 0 : dest_rgm->controls[i - 1]);
437 : 0 : gimple_seq_add_stmt (seq, assign);
438 : : }
439 : : else
440 : : {
441 : : /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
442 : 0 : step = gimple_build (seq, MINUS_EXPR, iv_type, step,
443 : 0 : dest_rgm->controls[i - 1]);
444 : 0 : gassign *assign
445 : 0 : = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
446 : 0 : gimple_seq_add_stmt (seq, assign);
447 : : }
448 : : }
449 : 0 : }
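: : /* For example, with four controls and VF/N = 4, STEP = 10
: : decomposes as 4 + 4 + 2 + 0: each control takes
: : MIN (remaining, 4) and the last takes whatever is left, so the
: : lengths always sum to STEP. */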
450 : :
451 : : /* Stores in BSI the standard position for the induction variable increment
452 : : belonging to LOOP_EXIT (just before the exit condition of the given exit).
453 : : INSERT_AFTER is set to true if the increment should be inserted after
454 : : *BSI. */
455 : :
456 : : void
457 : 60938 : vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
458 : : bool *insert_after)
459 : : {
460 : 60938 : basic_block bb = loop_exit->src;
461 : 60938 : *bsi = gsi_last_bb (bb);
462 : 60938 : *insert_after = false;
463 : 60938 : }
464 : :
465 : : /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
466 : : for all the rgroup controls in RGC and return a control that is nonzero
467 : : when the loop needs to iterate. Add any new preheader statements to
468 : : PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
469 : :
470 : : RGC belongs to loop LOOP. The loop originally iterated NITERS
471 : : times and has been vectorized according to LOOP_VINFO.
472 : :
473 : : If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
474 : : starts with NITERS_SKIP dummy iterations of the scalar loop before
475 : : the real work starts. The mask elements for these dummy iterations
476 : : must be 0, to ensure that the extra iterations do not have an effect.
477 : :
478 : : It is known that:
479 : :
480 : : NITERS * RGC->max_nscalars_per_iter * RGC->factor
481 : :
482 : : does not overflow. However, MIGHT_WRAP_P says whether an induction
483 : : variable that starts at 0 and has step:
484 : :
485 : : VF * RGC->max_nscalars_per_iter * RGC->factor
486 : :
487 : : might overflow before hitting a value above:
488 : :
489 : : (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor
490 : :
491 : : This means that we cannot guarantee that such an induction variable
492 : : would ever hit a value that produces a set of all-false masks or zero
493 : : lengths for RGC.
494 : :
495 : : Note: the cost of the code generated by this function is modeled
496 : : by vect_estimate_min_profitable_iters, so changes here may need
497 : : corresponding changes there. */
498 : :
499 : : static tree
500 : 0 : vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
501 : : gimple_seq *preheader_seq,
502 : : gimple_seq *header_seq,
503 : : gimple_stmt_iterator loop_cond_gsi,
504 : : rgroup_controls *rgc, tree niters,
505 : : tree niters_skip, bool might_wrap_p,
506 : : tree *iv_step, tree *compare_step)
507 : : {
508 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
509 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
510 : 0 : bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
511 : :
512 : 0 : tree ctrl_type = rgc->type;
513 : 0 : unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
514 : 0 : poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
515 : 0 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
516 : 0 : tree length_limit = NULL_TREE;
517 : : /* For length, we need length_limit to ensure length in range. */
518 : 0 : if (!use_masks_p)
519 : 0 : length_limit = build_int_cst (compare_type, nitems_per_ctrl);
520 : :
521 : : /* Calculate the maximum number of item values that the rgroup
522 : : handles in total, the number that it handles for each iteration
523 : : of the vector loop, and the number that it should skip during the
524 : : first iteration of the vector loop. */
525 : 0 : tree nitems_total = niters;
526 : 0 : tree nitems_step = build_int_cst (iv_type, vf);
527 : 0 : tree nitems_skip = niters_skip;
528 : 0 : if (nitems_per_iter != 1)
529 : : {
530 : : /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
531 : : these multiplications don't overflow. */
532 : 0 : tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
533 : 0 : tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
534 : 0 : nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
535 : : nitems_total, compare_factor);
536 : 0 : nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
537 : : nitems_step, iv_factor);
538 : 0 : if (nitems_skip)
539 : 0 : nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
540 : : nitems_skip, compare_factor);
541 : : }
542 : :
543 : : /* Create an induction variable that counts the number of items
544 : : processed. */
545 : 0 : tree index_before_incr, index_after_incr;
546 : 0 : gimple_stmt_iterator incr_gsi;
547 : 0 : bool insert_after;
548 : 0 : edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
549 : 0 : vect_iv_increment_position (exit_e, &incr_gsi, &insert_after);
550 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
551 : : {
552 : : /* Create an IV that counts down from niters_total and whose step
553 : : is the (variable) amount processed in the current iteration:
554 : : ...
555 : : _10 = (unsigned long) count_12(D);
556 : : ...
557 : : # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
558 : : _36 = (MIN_EXPR | SELECT_VL) <ivtmp_9, POLY_INT_CST [4, 4]>;
559 : : ...
560 : : vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
561 : : ...
562 : : ivtmp_35 = ivtmp_9 - POLY_INT_CST [4, 4];
563 : : ...
564 : : if (ivtmp_9 > POLY_INT_CST [4, 4])
565 : : goto <bb 4>; [83.33%]
566 : : else
567 : : goto <bb 5>; [16.67%]
568 : : */
569 : 0 : nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
570 : 0 : tree step = rgc->controls.length () == 1 ? rgc->controls[0]
571 : 0 : : make_ssa_name (iv_type);
572 : : /* Create decrement IV. */
573 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
574 : : {
575 : 0 : create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
576 : : insert_after, &index_before_incr, &index_after_incr);
577 : 0 : tree vectype = build_zero_cst (rgc->type);
578 : 0 : tree len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
579 : : index_before_incr, nitems_step,
580 : : vectype);
581 : 0 : gimple_seq_add_stmt (header_seq, gimple_build_assign (step, len));
582 : : }
583 : : else
584 : : {
585 : 0 : create_iv (nitems_total, MINUS_EXPR, nitems_step, NULL_TREE, loop,
586 : : &incr_gsi, insert_after, &index_before_incr,
587 : : &index_after_incr);
588 : 0 : gimple_seq_add_stmt (header_seq,
589 : 0 : gimple_build_assign (step, MIN_EXPR,
590 : : index_before_incr,
591 : : nitems_step));
592 : : }
593 : 0 : *iv_step = step;
594 : 0 : *compare_step = nitems_step;
595 : 0 : return LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) ? index_after_incr
596 : 0 : : index_before_incr;
597 : : }
598 : :
599 : : /* Create increment IV. */
600 : 0 : create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
601 : : loop, &incr_gsi, insert_after, &index_before_incr,
602 : : &index_after_incr);
603 : :
604 : 0 : tree zero_index = build_int_cst (compare_type, 0);
605 : 0 : tree test_index, test_limit, first_limit;
606 : 0 : gimple_stmt_iterator *test_gsi;
607 : 0 : if (might_wrap_p)
608 : : {
609 : : /* In principle the loop should stop iterating once the incremented
610 : : IV reaches a value greater than or equal to:
611 : :
612 : : NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP
613 : :
614 : : However, there's no guarantee that this addition doesn't overflow
615 : : the comparison type, or that the IV hits a value above it before
616 : : wrapping around. We therefore adjust the limit down by one
617 : : IV step:
618 : :
619 : : (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
620 : : -[infinite-prec] NITEMS_STEP
621 : :
622 : : and compare the IV against this limit _before_ incrementing it.
623 : : Since the comparison type is unsigned, we actually want the
624 : : subtraction to saturate at zero:
625 : :
626 : : (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
627 : : -[sat] NITEMS_STEP
628 : :
629 : : And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:
630 : :
631 : : NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)
632 : :
633 : : where the rightmost subtraction can be done directly in
634 : : COMPARE_TYPE. */
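: : /* For example, with an 8-bit unsigned COMPARE_TYPE,
: : NITEMS_TOTAL = 250, NITEMS_SKIP = 10 and NITEMS_STEP = 16, the
: : exact limit 250 + 10 - 16 = 244 is computed as
: : MAX (250, 16 - 10) - (16 - 10) = 244, without ever forming
: : 250 + 10 = 260, which would wrap to 4 in eight bits. */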
635 : 0 : test_index = index_before_incr;
636 : 0 : tree adjust = gimple_convert (preheader_seq, compare_type,
637 : : nitems_step);
638 : 0 : if (nitems_skip)
639 : 0 : adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
640 : : adjust, nitems_skip);
641 : 0 : test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
642 : : nitems_total, adjust);
643 : 0 : test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
644 : : test_limit, adjust);
645 : 0 : test_gsi = &incr_gsi;
646 : :
647 : : /* Get a safe limit for the first iteration. */
648 : 0 : if (nitems_skip)
649 : : {
650 : : /* The first vector iteration can handle at most NITEMS_STEP
651 : : items. NITEMS_STEP <= CONST_LIMIT, and adding
652 : : NITEMS_SKIP to that cannot overflow. */
653 : 0 : tree const_limit = build_int_cst (compare_type,
654 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)
655 : 0 : * nitems_per_iter);
656 : 0 : first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
657 : : nitems_total, const_limit);
658 : 0 : first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
659 : : first_limit, nitems_skip);
660 : : }
661 : : else
662 : : /* For the first iteration it doesn't matter whether the IV hits
663 : : a value above NITEMS_TOTAL. That only matters for the latch
664 : : condition. */
665 : : first_limit = nitems_total;
666 : : }
667 : : else
668 : : {
669 : : /* Test the incremented IV, which will always hit a value above
670 : : the bound before wrapping. */
671 : 0 : test_index = index_after_incr;
672 : 0 : test_limit = nitems_total;
673 : 0 : if (nitems_skip)
674 : 0 : test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
675 : : test_limit, nitems_skip);
676 : : test_gsi = &loop_cond_gsi;
677 : :
678 : : first_limit = test_limit;
679 : : }
680 : :
681 : : /* Convert the IV value to the comparison type (either a no-op or
682 : : a demotion). */
683 : 0 : gimple_seq test_seq = NULL;
684 : 0 : test_index = gimple_convert (&test_seq, compare_type, test_index);
685 : 0 : gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
686 : :
687 : : /* Provide a definition of each control in the group. */
688 : 0 : tree next_ctrl = NULL_TREE;
689 : 0 : tree ctrl;
690 : 0 : unsigned int i;
691 : 0 : FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
692 : : {
693 : : /* Previous controls will cover BIAS items. This control covers the
694 : : next batch. */
695 : 0 : poly_uint64 bias = nitems_per_ctrl * i;
696 : 0 : tree bias_tree = build_int_cst (compare_type, bias);
697 : :
698 : : /* See whether the first iteration of the vector loop is known
699 : : to have a full control. */
700 : 0 : poly_uint64 const_limit;
701 : 0 : bool first_iteration_full
702 : 0 : = (poly_int_tree_p (first_limit, &const_limit)
703 : 0 : && known_ge (const_limit, (i + 1) * nitems_per_ctrl));
704 : :
705 : : /* Rather than have a new IV that starts at BIAS and goes up to
706 : : TEST_LIMIT, prefer to use the same 0-based IV for each control
707 : : and adjust the bound down by BIAS. */
708 : 0 : tree this_test_limit = test_limit;
709 : 0 : if (i != 0)
710 : : {
711 : 0 : this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
712 : : compare_type, this_test_limit,
713 : : bias_tree);
714 : 0 : this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
715 : : compare_type, this_test_limit,
716 : : bias_tree);
717 : : }
718 : :
719 : : /* Create the initial control. First include all items that
720 : : are within the loop limit. */
721 : 0 : tree init_ctrl = NULL_TREE;
722 : 0 : if (!first_iteration_full)
723 : : {
724 : 0 : tree start, end;
725 : 0 : if (first_limit == test_limit)
726 : : {
727 : : /* Use a natural test between zero (the initial IV value)
728 : : and the loop limit. The "else" block would be valid too,
729 : : but this choice can avoid the need to load BIAS_TREE into
730 : : a register. */
731 : : start = zero_index;
732 : : end = this_test_limit;
733 : : }
734 : : else
735 : : {
736 : : /* FIRST_LIMIT is the maximum number of items handled by the
737 : : first iteration of the vector loop. Test the portion
738 : : associated with this control. */
739 : 0 : start = bias_tree;
740 : 0 : end = first_limit;
741 : : }
742 : :
743 : 0 : if (use_masks_p)
744 : 0 : init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
745 : : start, end, "max_mask");
746 : : else
747 : : {
748 : 0 : init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
749 : 0 : gimple_seq seq = vect_gen_len (init_ctrl, start,
750 : : end, length_limit);
751 : 0 : gimple_seq_add_seq (preheader_seq, seq);
752 : : }
753 : : }
754 : :
755 : : /* Now AND out the bits that are within the number of skipped
756 : : items. */
757 : 0 : poly_uint64 const_skip;
758 : 0 : if (nitems_skip
759 : 0 : && !(poly_int_tree_p (nitems_skip, &const_skip)
760 : 0 : && known_le (const_skip, bias)))
761 : : {
762 : 0 : gcc_assert (use_masks_p);
763 : 0 : tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
764 : : bias_tree, nitems_skip);
765 : 0 : if (init_ctrl)
766 : 0 : init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
767 : : init_ctrl, unskipped_mask);
768 : : else
769 : : init_ctrl = unskipped_mask;
770 : : }
771 : :
772 : 0 : if (!init_ctrl)
773 : : {
774 : : /* First iteration is full. */
775 : 0 : if (use_masks_p)
776 : 0 : init_ctrl = build_minus_one_cst (ctrl_type);
777 : : else
778 : : init_ctrl = length_limit;
779 : : }
780 : :
781 : : /* Get the control value for the next iteration of the loop. */
782 : 0 : if (use_masks_p)
783 : : {
784 : 0 : gimple_seq stmts = NULL;
785 : 0 : next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
786 : : this_test_limit, "next_mask");
787 : 0 : gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
788 : : }
789 : : else
790 : : {
791 : 0 : next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
792 : 0 : gimple_seq seq = vect_gen_len (next_ctrl, test_index, this_test_limit,
793 : : length_limit);
794 : 0 : gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
795 : : }
796 : :
797 : 0 : vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
798 : : }
799 : :
800 : 0 : int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
801 : 0 : if (partial_load_bias != 0)
802 : : {
803 : 0 : tree adjusted_len = rgc->bias_adjusted_ctrl;
804 : 0 : gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
805 : 0 : rgc->controls[0],
806 : : build_int_cst
807 : 0 : (TREE_TYPE (rgc->controls[0]),
808 : 0 : partial_load_bias));
809 : 0 : gimple_seq_add_stmt (header_seq, minus);
810 : : }
811 : :
812 : : return next_ctrl;
813 : : }
814 : :
815 : : /* Set up the iteration condition and rgroup controls for LOOP, given
816 : : that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
817 : : loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
818 : : the number of iterations of the original scalar loop that should be
819 : : handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
820 : : for vect_set_loop_condition.
821 : :
822 : : Insert the branch-back condition before LOOP_COND_GSI and return the
823 : : final gcond. */
824 : :
825 : : static gcond *
826 : 0 : vect_set_loop_condition_partial_vectors (class loop *loop, edge exit_edge,
827 : : loop_vec_info loop_vinfo, tree niters,
828 : : tree final_iv, bool niters_maybe_zero,
829 : : gimple_stmt_iterator loop_cond_gsi)
830 : : {
831 : 0 : gimple_seq preheader_seq = NULL;
832 : 0 : gimple_seq header_seq = NULL;
833 : :
834 : 0 : bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
835 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
836 : 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
837 : 0 : tree orig_niters = niters;
838 : :
839 : : /* Type of the initial value of NITERS. */
840 : 0 : tree ni_actual_type = TREE_TYPE (niters);
841 : 0 : unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
842 : 0 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
843 : 0 : if (niters_skip)
844 : 0 : niters_skip = gimple_convert (&preheader_seq, compare_type, niters_skip);
845 : :
846 : : /* Convert NITERS to the same size as the compare. */
847 : 0 : if (compare_precision > ni_actual_precision
848 : 0 : && niters_maybe_zero)
849 : : {
850 : : /* We know that there is always at least one iteration, so if the
851 : : count is zero then it must have wrapped. Cope with this by
852 : : subtracting 1 before the conversion and adding 1 to the result. */
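: : /* E.g. an 8-bit NITERS of 0 stands for 256 iterations:
: : 0 - 1 == 255 in eight bits, widening gives 255, and adding 1
: : yields the intended 256. */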
853 : 0 : gcc_assert (TYPE_UNSIGNED (ni_actual_type));
854 : 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
855 : : niters, build_minus_one_cst (ni_actual_type));
856 : 0 : niters = gimple_convert (&preheader_seq, compare_type, niters);
857 : 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
858 : : niters, build_one_cst (compare_type));
859 : : }
860 : : else
861 : 0 : niters = gimple_convert (&preheader_seq, compare_type, niters);
862 : :
863 : : /* Iterate over all the rgroups and fill in their controls. We could use
864 : : the first control from any rgroup for the loop condition; here we
865 : : arbitrarily pick the last. */
866 : 0 : tree test_ctrl = NULL_TREE;
867 : 0 : tree iv_step = NULL_TREE;
868 : 0 : tree compare_step = NULL_TREE;
869 : 0 : rgroup_controls *rgc;
870 : 0 : rgroup_controls *iv_rgc = nullptr;
871 : 0 : unsigned int i;
872 : 0 : auto_vec<rgroup_controls> *controls = use_masks_p
873 : 0 : ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
874 : : : &LOOP_VINFO_LENS (loop_vinfo);
875 : 0 : FOR_EACH_VEC_ELT (*controls, i, rgc)
876 : 0 : if (!rgc->controls.is_empty ())
877 : : {
878 : : /* First try using permutes. This adds a single vector
879 : : instruction to the loop for each mask, but needs no extra
880 : : loop invariants or IVs. */
881 : 0 : unsigned int nmasks = i + 1;
882 : 0 : if (use_masks_p && (nmasks & 1) == 0)
883 : : {
884 : 0 : rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
885 : 0 : if (!half_rgc->controls.is_empty ()
886 : 0 : && vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
887 : 0 : continue;
888 : : }
889 : :
890 : 0 : if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
891 : 0 : || !iv_rgc
892 : 0 : || (iv_rgc->max_nscalars_per_iter * iv_rgc->factor
893 : 0 : != rgc->max_nscalars_per_iter * rgc->factor))
894 : : {
895 : : /* See whether zero-based IV would ever generate all-false masks
896 : : or zero length before wrapping around. */
897 : 0 : bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
898 : :
899 : : /* Set up all controls for this group. */
900 : 0 : test_ctrl
901 : 0 : = vect_set_loop_controls_directly (loop, loop_vinfo,
902 : : &preheader_seq, &header_seq,
903 : : loop_cond_gsi, rgc, niters,
904 : : niters_skip, might_wrap_p,
905 : : &iv_step, &compare_step);
906 : :
907 : 0 : iv_rgc = rgc;
908 : : }
909 : :
910 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
911 : 0 : && rgc->controls.length () > 1)
912 : : {
913 : : /* vect_set_loop_controls_directly creates an IV whose step
914 : : is equal to the expected sum of RGC->controls. Use that
915 : : information to populate RGC->controls. */
916 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
917 : 0 : gcc_assert (iv_step);
918 : 0 : vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, iv_step);
919 : : }
920 : : }
921 : :
922 : : /* Emit all accumulated statements. */
923 : 0 : add_preheader_seq (loop, preheader_seq);
924 : 0 : add_header_seq (loop, header_seq);
925 : :
926 : : /* Get a boolean result that tells us whether to iterate. */
927 : 0 : gcond *cond_stmt;
928 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
929 : 0 : && !LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
930 : : {
931 : 0 : gcc_assert (compare_step);
932 : 0 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
933 : 0 : cond_stmt = gimple_build_cond (code, test_ctrl, compare_step, NULL_TREE,
934 : : NULL_TREE);
935 : 0 : }
936 : : else
937 : : {
938 : 0 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
939 : 0 : tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
940 : 0 : cond_stmt
941 : 0 : = gimple_build_cond (code, test_ctrl, zero_ctrl, NULL_TREE, NULL_TREE);
942 : : }
943 : 0 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
944 : :
945 : : /* The loop iterates (NITERS - 1) / VF + 1 times.
946 : : Subtract one from this to get the latch count. */
947 : 0 : tree step = build_int_cst (compare_type,
948 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo));
949 : 0 : tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
950 : : build_minus_one_cst (compare_type));
951 : 0 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
952 : : niters_minus_one, step);
953 : :
954 : 0 : if (final_iv)
955 : : {
956 : 0 : gassign *assign;
957 : : /* If vectorizing an inverted early break loop we have to restart the
958 : : scalar loop at niters - vf. This matches what we do in
959 : : vect_gen_vector_loop_niters_mult_vf for non-masked loops. */
960 : 0 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
961 : : {
962 : 0 : tree ftype = TREE_TYPE (orig_niters);
963 : 0 : tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
964 : 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
965 : : }
966 : : else
967 : 0 : assign = gimple_build_assign (final_iv, orig_niters);
968 : 0 : gsi_insert_on_edge_immediate (exit_edge, assign);
969 : : }
970 : :
971 : 0 : return cond_stmt;
972 : : }
973 : :
974 : : /* Set up the iteration condition and rgroup controls for LOOP in AVX512
975 : : style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
976 : : vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
977 : : the number of iterations of the original scalar loop that should be
978 : : handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
979 : : for vect_set_loop_condition.
980 : :
981 : : Insert the branch-back condition before LOOP_COND_GSI and return the
982 : : final gcond. */
983 : :
984 : : static gcond *
985 : 18 : vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
986 : : edge exit_edge,
987 : : loop_vec_info loop_vinfo, tree niters,
988 : : tree final_iv,
989 : : bool niters_maybe_zero,
990 : : gimple_stmt_iterator loop_cond_gsi)
991 : : {
992 : 18 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
993 : 18 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
994 : 18 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
995 : 18 : tree orig_niters = niters;
996 : 18 : gimple_seq preheader_seq = NULL;
997 : :
998 : : /* Create an IV that counts down from niters and whose step
999 : : is the number of iterations processed in the current iteration.
1000 : : Produce the controls with compares like the following.
1001 : :
1002 : : # iv_2 = PHI <niters, iv_3>
1003 : : rem_4 = MIN <iv_2, VF>;
1004 : : remv_6 = { rem_4, rem_4, rem_4, ... }
1005 : : mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv_6;
1006 : : iv_3 = iv_2 - VF;
1007 : : if (iv_2 > VF)
1008 : : continue;
1009 : :
1010 : : Where the constant is built with elements at most VF - 1 and
1011 : : repetitions according to max_nscalars_per_iter which is guaranteed
1012 : : to be the same within a group. */
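: : /* For example, with VF = 4 and max_nscalars_per_iter = 2 the
: : constant is { 0, 0, 1, 1, 2, 2, 3, 3 }; when three scalar
: : iterations remain, the compare against the splat { 3, 3, ... }
: : yields the mask { 1, 1, 1, 1, 1, 1, 0, 0 }, enabling the lanes of
: : the first three iterations. */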
1013 : :
1014 : : /* Convert NITERS to the determined IV type. */
1015 : 18 : if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
1016 : 18 : && niters_maybe_zero)
1017 : : {
1018 : : /* We know that there is always at least one iteration, so if the
1019 : : count is zero then it must have wrapped. Cope with this by
1020 : : subtracting 1 before the conversion and adding 1 to the result. */
1021 : 0 : gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
1022 : 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters),
1023 : 0 : niters, build_minus_one_cst (TREE_TYPE (niters)));
1024 : 0 : niters = gimple_convert (&preheader_seq, iv_type, niters);
1025 : 0 : niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type,
1026 : : niters, build_one_cst (iv_type));
1027 : : }
1028 : : else
1029 : 18 : niters = gimple_convert (&preheader_seq, iv_type, niters);
1030 : :
1031 : : /* Bias the initial value of the IV in case we need to skip iterations
1032 : : at the beginning. */
1033 : 18 : tree niters_adj = niters;
1034 : 18 : if (niters_skip)
1035 : : {
1036 : 1 : tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip);
1037 : 1 : niters_adj = gimple_build (&preheader_seq, PLUS_EXPR,
1038 : : iv_type, niters, skip);
1039 : : }
1040 : :
1041 : : /* The iteration step is the vectorization factor. */
1042 : 18 : tree iv_step = build_int_cst (iv_type, vf);
1043 : :
1044 : : /* Create the decrement IV. */
1045 : 18 : tree index_before_incr, index_after_incr;
1046 : 18 : gimple_stmt_iterator incr_gsi;
1047 : 18 : bool insert_after;
1048 : 18 : vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1049 : 18 : create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
1050 : : &incr_gsi, insert_after, &index_before_incr,
1051 : : &index_after_incr);
1052 : :
1053 : : /* Iterate over all the rgroups and fill in their controls. */
1054 : 80 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1055 : : {
1056 : 26 : if (rgc.controls.is_empty ())
1057 : 6 : continue;
1058 : :
1059 : 20 : tree ctrl_type = rgc.type;
1060 : 20 : poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
1061 : :
1062 : 20 : tree vectype = rgc.compare_type;
1063 : :
1064 : : /* index_after_incr is the IV specifying the remaining iterations in
1065 : : the next iteration. */
1066 : 20 : tree rem = index_after_incr;
1067 : : /* When the data type for the compare to produce the mask is
1068 : : smaller than the IV type we need to saturate. Saturate to
1069 : : the smallest possible value (IV_TYPE) so we only have to
1070 : : saturate once (CSE will catch redundant ones we add). */
1071 : 20 : if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
1072 : 9 : rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1073 : : UNKNOWN_LOCATION,
1074 : 9 : MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
1075 : 20 : rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
1076 : 20 : UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
1077 : :
1078 : : /* Build a data vector composed of the remaining iterations. */
1079 : 20 : rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
1080 : : UNKNOWN_LOCATION, vectype, rem);
1081 : :
1082 : : /* Provide a definition of each vector in the control group. */
1083 : 20 : tree next_ctrl = NULL_TREE;
1084 : 20 : tree first_rem = NULL_TREE;
1085 : 20 : tree ctrl;
1086 : 20 : unsigned int i;
1087 : 238 : FOR_EACH_VEC_ELT_REVERSE (rgc.controls, i, ctrl)
1088 : : {
1089 : : /* Previous controls will cover BIAS items. This control covers the
1090 : : next batch. */
1091 : 86 : poly_uint64 bias = nitems_per_ctrl * i;
1092 : :
1093 : : /* Build the constant to compare the remaining iters against,
1094 : : this is something like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately
1095 : : split into pieces. */
1096 : 86 : unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant ();
1097 : 86 : tree_vector_builder builder (vectype, n, 1);
1098 : 1266 : for (unsigned i = 0; i < n; ++i)
1099 : : {
1100 : 1180 : unsigned HOST_WIDE_INT val
1101 : 1180 : = (i + bias.to_constant ()) / rgc.max_nscalars_per_iter;
1102 : 1180 : gcc_assert (val < vf.to_constant ());
1103 : 1180 : builder.quick_push (build_int_cst (TREE_TYPE (vectype), val));
1104 : : }
1105 : 86 : tree cmp_series = builder.build ();
1106 : :
1107 : : /* Create the initial control. First include all items that
1108 : : are within the loop limit. */
1109 : 86 : tree init_ctrl = NULL_TREE;
1110 : 86 : poly_uint64 const_limit;
1111 : : /* See whether the first iteration of the vector loop is known
1112 : : to have a full control. */
1113 : 86 : if (poly_int_tree_p (niters, &const_limit)
1114 : 86 : && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
1115 : 1 : init_ctrl = build_minus_one_cst (ctrl_type);
1116 : : else
1117 : : {
1118 : : /* The remaining work items initially are niters. Saturate,
1119 : : splat and compare. */
1120 : 85 : if (!first_rem)
1121 : : {
1122 : 19 : first_rem = niters;
1123 : 19 : if (TYPE_PRECISION (TREE_TYPE (vectype))
1124 : 19 : < TYPE_PRECISION (iv_type))
1125 : 9 : first_rem = gimple_build (&preheader_seq,
1126 : 9 : MIN_EXPR, TREE_TYPE (first_rem),
1127 : : first_rem, iv_step);
1128 : 19 : first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype),
1129 : : first_rem);
1130 : 19 : first_rem = gimple_build_vector_from_val (&preheader_seq,
1131 : : vectype, first_rem);
1132 : : }
1133 : 85 : init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type,
1134 : : cmp_series, first_rem);
1135 : : }
1136 : :
1137 : : /* Now AND out the bits that are within the number of skipped
1138 : : items. */
1139 : 86 : poly_uint64 const_skip;
1140 : 86 : if (niters_skip
1141 : 86 : && !(poly_int_tree_p (niters_skip, &const_skip)
1142 : 1 : && known_le (const_skip, bias)))
1143 : : {
1144 : : /* For integer mode masks it's cheaper to shift out the bits
1145 : : since that avoids loading a constant. */
1146 : 1 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
1147 : 1 : init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1148 : 1 : lang_hooks.types.type_for_mode
1149 : 1 : (TYPE_MODE (ctrl_type), 1),
1150 : : init_ctrl);
1151 : : /* ??? But when the shift amount isn't constant this requires
1152 : : a round-trip to GPRs. We could apply the bias to either
1153 : : side of the compare instead. */
1154 : 2 : tree shift = gimple_build (&preheader_seq, MINUS_EXPR,
1155 : 1 : TREE_TYPE (niters_skip), niters_skip,
1156 : 1 : build_int_cst (TREE_TYPE (niters_skip),
1157 : 1 : bias));
1158 : 2 : shift = gimple_build (&preheader_seq, MULT_EXPR,
1159 : 1 : TREE_TYPE (niters_skip), shift,
1160 : 1 : build_int_cst (TREE_TYPE (niters_skip),
1161 : 1 : rgc.max_nscalars_per_iter));
1162 : 1 : init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR,
1163 : 1 : TREE_TYPE (init_ctrl),
1164 : : init_ctrl, shift);
1165 : 1 : init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
1166 : : ctrl_type, init_ctrl);
1167 : : }
1168 : :
1169 : : /* Get the control value for the next iteration of the loop. */
1170 : 86 : next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
1171 : : UNKNOWN_LOCATION,
1172 : : LT_EXPR, ctrl_type, cmp_series, rem);
1173 : :
1174 : 86 : vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
1175 : 86 : }
1176 : : }
1177 : :
1178 : : /* Emit all accumulated statements. */
1179 : 18 : add_preheader_seq (loop, preheader_seq);
1180 : :
1181 : : /* Adjust the exit test using the decrementing IV. */
1182 : 18 : tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
1183 : : /* When we peel for alignment with niter_skip != 0, niter + niter_skip
1184 : : can wrap, and since we compare the value before the decrement
1185 : : here, we would get a false early exit.
1186 : : We can't compare the value after the decrement either, because
1187 : : that decrement could wrap as well, since it is not a saturating
1188 : : decrement. To avoid this situation we force a larger
1189 : : iv_type. */
1190 : 18 : gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
1191 : : NULL_TREE, NULL_TREE);
1192 : 18 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1193 : :
1194 : : /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
1195 : : Subtract one from this to get the latch count. */
1196 : 18 : tree niters_minus_one
1197 : 18 : = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
1198 : : build_minus_one_cst (TREE_TYPE (orig_niters)));
1199 : 18 : tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
1200 : 18 : if (niters_skip)
1201 : 1 : niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_minus_one,
1202 : : fold_convert (iv_type, niters_skip));
1203 : 18 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
1204 : : niters_adj2, iv_step);
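: : /* For example, NITERS = 10, NITERS_SKIP = 2 and VF = 4 give a
: : latch count of (10 - 1 + 2) / 4 = 2, i.e. the vector loop body
: : executes ceil ((10 + 2) / 4) = 3 times. */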
1205 : :
1206 : 18 : if (final_iv)
1207 : : {
1208 : 0 : gassign *assign;
1209 : : /* If vectorizing an inverted early break loop we have to restart the
1210 : : scalar loop at niters - vf. This matches what we do in
1211 : : vect_gen_vector_loop_niters_mult_vf for non-masked loops. */
1212 : 0 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
1213 : : {
1214 : 0 : tree ftype = TREE_TYPE (orig_niters);
1215 : 0 : tree vf = build_int_cst (ftype, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1216 : 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR, orig_niters, vf);
1217 : : }
1218 : : else
1219 : 0 : assign = gimple_build_assign (final_iv, orig_niters);
1220 : 0 : gsi_insert_on_edge_immediate (exit_edge, assign);
1221 : : }
1222 : :
1223 : 18 : return cond_stmt;
1224 : : }
1225 : :
1226 : :
1227 : : /* Like vect_set_loop_condition, but handle the case in which the vector
1228 : : loop handles exactly VF scalars per iteration. */
1229 : :
1230 : : static gcond *
1231 : 60853 : vect_set_loop_condition_normal (loop_vec_info /* loop_vinfo */, edge exit_edge,
1232 : : class loop *loop, tree niters, tree step,
1233 : : tree final_iv, bool niters_maybe_zero,
1234 : : gimple_stmt_iterator loop_cond_gsi)
1235 : : {
1236 : 60853 : tree indx_before_incr, indx_after_incr;
1237 : 60853 : gcond *cond_stmt;
1238 : 60853 : gcond *orig_cond;
1239 : 60853 : edge pe = loop_preheader_edge (loop);
1240 : 60853 : gimple_stmt_iterator incr_gsi;
1241 : 60853 : bool insert_after;
1242 : 60853 : enum tree_code code;
1243 : 60853 : tree niters_type = TREE_TYPE (niters);
1244 : :
1245 : 60853 : orig_cond = get_loop_exit_condition (exit_edge);
1246 : 60853 : gcc_assert (orig_cond);
1247 : 60853 : loop_cond_gsi = gsi_for_stmt (orig_cond);
1248 : :
1249 : 60853 : tree init, limit;
1250 : 60853 : if (!niters_maybe_zero && integer_onep (step))
1251 : : {
1252 : : /* In this case we can use a simple 0-based IV:
1253 : :
1254 : : A:
1255 : : x = 0;
1256 : : do
1257 : : {
1258 : : ...
1259 : : x += 1;
1260 : : }
1261 : : while (x < NITERS); */
1262 : 60853 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1263 : 60853 : init = build_zero_cst (niters_type);
1264 : 60853 : limit = niters;
1265 : : }
1266 : : else
1267 : : {
1268 : : /* The following works for all values of NITERS except 0:
1269 : :
1270 : : B:
1271 : : x = 0;
1272 : : do
1273 : : {
1274 : : ...
1275 : : x += STEP;
1276 : : }
1277 : : while (x <= NITERS - STEP);
1278 : :
1279 : : so that the loop continues to iterate if x + STEP - 1 < NITERS
1280 : : but stops if x + STEP - 1 >= NITERS.
1281 : :
1282 : : However, if NITERS is zero, x never hits a value above NITERS - STEP
1283 : : before wrapping around. There are two obvious ways of dealing with
1284 : : this:
1285 : :
1286 : : - start at STEP - 1 and compare x before incrementing it
1287 : : - start at -1 and compare x after incrementing it
1288 : :
1289 : : The latter is simpler and is what we use. The loop in this case
1290 : : looks like:
1291 : :
1292 : : C:
1293 : : x = -1;
1294 : : do
1295 : : {
1296 : : ...
1297 : : x += STEP;
1298 : : }
1299 : : while (x < NITERS - STEP);
1300 : :
1301 : : In both cases the loop limit is NITERS - STEP. */
1302 : 0 : gimple_seq seq = NULL;
1303 : 0 : limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
1304 : 0 : limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
1305 : 0 : if (seq)
1306 : : {
1307 : 0 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1308 : 0 : gcc_assert (!new_bb);
1309 : : }
1310 : 0 : if (niters_maybe_zero)
1311 : : {
1312 : : /* Case C. */
1313 : 0 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
1314 : 0 : init = build_all_ones_cst (niters_type);
1315 : : }
1316 : : else
1317 : : {
1318 : : /* Case B. */
1319 : 0 : code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
1320 : 0 : init = build_zero_cst (niters_type);
1321 : : }
1322 : : }
1323 : :
1324 : 60853 : vect_iv_increment_position (exit_edge, &incr_gsi, &insert_after);
1325 : 60853 : create_iv (init, PLUS_EXPR, step, NULL_TREE, loop,
1326 : : &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
1327 : 60853 : indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
1328 : : true, NULL_TREE, true,
1329 : : GSI_SAME_STMT);
1330 : 60853 : limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
1331 : : true, GSI_SAME_STMT);
1332 : :
1333 : 60853 : cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
1334 : : NULL_TREE);
1335 : :
1336 : 60853 : gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
1337 : :
1338 : : /* Record the number of latch iterations. */
1339 : 60853 : if (limit == niters)
1340 : : /* Case A: the loop iterates NITERS times. Subtract one to get the
1341 : : latch count. */
1342 : 60853 : loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
1343 : : build_int_cst (niters_type, 1));
1344 : : else
1345 : : /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
1346 : : Subtract one from this to get the latch count. */
1347 : 0 : loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
1348 : : limit, step);
1349 : :
1350 : 60853 : if (final_iv)
1351 : : {
1352 : 0 : gassign *assign;
1353 : 0 : gcc_assert (single_pred_p (exit_edge->dest));
1354 : 0 : tree phi_dest
1355 : 0 : = integer_zerop (init) ? final_iv : copy_ssa_name (indx_after_incr);
1356 : : /* Make sure to maintain LC SSA form here and elide the subtraction
1357 : : if the value is zero. */
1358 : 0 : gphi *phi = create_phi_node (phi_dest, exit_edge->dest);
1359 : 0 : add_phi_arg (phi, indx_after_incr, exit_edge, UNKNOWN_LOCATION);
1360 : 0 : if (!integer_zerop (init))
1361 : : {
1362 : 0 : assign = gimple_build_assign (final_iv, MINUS_EXPR,
1363 : : phi_dest, init);
1364 : 0 : gimple_stmt_iterator gsi = gsi_after_labels (exit_edge->dest);
1365 : 0 : gsi_insert_before (&gsi, assign, GSI_SAME_STMT);
1366 : : }
1367 : : }
1368 : :
1369 : 60853 : return cond_stmt;
1370 : : }
1371 : :
1372 : : /* If we're using fully-masked loops, make LOOP iterate:
1373 : :
1374 : : N == (NITERS - 1) / STEP + 1
1375 : :
1376 : : times. When NITERS is zero, this is equivalent to making the loop
1377 : : execute (1 << M) / STEP times, where M is the precision of NITERS.
1378 : : NITERS_MAYBE_ZERO is true if this last case might occur.
1379 : :
1380 : : If we're not using fully-masked loops, make LOOP iterate:
1381 : :
1382 : : N == (NITERS - STEP) / STEP + 1
1383 : :
1384 : : times, where NITERS is known to be outside the range [1, STEP - 1].
1385 : : This is equivalent to making the loop execute NITERS / STEP times
1386 : : when NITERS is nonzero and (1 << M) / STEP times otherwise.
1387 : : NITERS_MAYBE_ZERO again indicates whether this last case might occur.
1388 : :
1389 : : If FINAL_IV is nonnull, it is an SSA name that should be set to
1390 : : N * STEP on exit from the loop.
1391 : :
1392 : : Assumption: the exit-condition of LOOP is the last stmt in the loop. */
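: : /* For example, for NITERS = 10 and STEP = 4 a fully-masked loop
: : iterates (10 - 1) / 4 + 1 = 3 times (the ceiling of 10/4), while
: : a non-masked loop iterates (10 - 4) / 4 + 1 = 2 times, any
: : remaining scalar iterations typically being handled by a separate
: : epilogue loop. */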
1393 : :
1394 : : void
1395 : 60871 : vect_set_loop_condition (class loop *loop, edge loop_e, loop_vec_info loop_vinfo,
1396 : : tree niters, tree step, tree final_iv,
1397 : : bool niters_maybe_zero)
1398 : : {
1399 : 60871 : gcond *cond_stmt;
1400 : 60871 : gcond *orig_cond = get_loop_exit_condition (loop_e);
1401 : 60871 : gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
1402 : :
1403 : 60871 : if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1404 : : {
1405 : 18 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
1406 : 18 : cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_e,
1407 : : loop_vinfo,
1408 : : niters, final_iv,
1409 : : niters_maybe_zero,
1410 : : loop_cond_gsi);
1411 : : else
1412 : 0 : cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_e,
1413 : : loop_vinfo,
1414 : : niters, final_iv,
1415 : : niters_maybe_zero,
1416 : : loop_cond_gsi);
1417 : : }
1418 : : else
1419 : 60853 : cond_stmt = vect_set_loop_condition_normal (loop_vinfo, loop_e, loop,
1420 : : niters,
1421 : : step, final_iv,
1422 : : niters_maybe_zero,
1423 : : loop_cond_gsi);
1424 : :
1425 : : /* Remove old loop exit test. */
1426 : 60871 : stmt_vec_info orig_cond_info;
1427 : 60871 : if (loop_vinfo
1428 : 60871 : && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
1429 : 60465 : loop_vinfo->remove_stmt (orig_cond_info);
1430 : : else
1431 : 406 : gsi_remove (&loop_cond_gsi, true);
1432 : :
1433 : 60871 : if (dump_enabled_p ())
1434 : 10702 : dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
1435 : : (gimple *) cond_stmt);
1436 : 60871 : }
1437 : :
1438 : : /* Get the virtual operand live on E. The precondition is that immediate
1439 : : dominators are valid and that an actual virtual definition dominates E. */
1440 : : /* ??? Costly band-aid. For the use in question we can populate a
1441 : : live-on-exit/end-of-BB virtual operand when copying stmts. */
1442 : :
1443 : : static tree
1444 : 10 : get_live_virtual_operand_on_edge (edge e)
1445 : : {
1446 : 10 : basic_block bb = e->src;
1447 : 22 : do
1448 : : {
1449 : 61 : for (auto gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1450 : : {
1451 : 37 : gimple *stmt = gsi_stmt (gsi);
1452 : 55 : if (gimple_vdef (stmt))
1453 : 10 : return gimple_vdef (stmt);
1454 : 39 : if (gimple_vuse (stmt))
1455 : : return gimple_vuse (stmt);
1456 : : }
1457 : 8 : if (gphi *vphi = get_virtual_phi (bb))
1458 : 2 : return gimple_phi_result (vphi);
1459 : 6 : bb = get_immediate_dominator (CDI_DOMINATORS, bb);
1460 : 6 : }
1461 : : while (1);
1462 : : }
1463 : :
1464 : : /* Given LOOP this function generates a new copy of it and puts it
1465 : : on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
1466 : : non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1467 : : basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1468 : : entry or exit of LOOP. If FLOW_LOOPS then connect LOOP to SCALAR_LOOP as a
1469 : : continuation. This is correct for cases where one loop continues from the
1470 : : other like in the vectorizer, but not true for uses in e.g. loop distribution
1471 : : where the contents of the loop body are split but the iteration space of both
1472 : : copies remains the same.
1473 : :
1474 : : If UPDATED_DOMS is not NULL it is updated with the list of basic blocks whose
1475 : : dominators were updated during the peeling. When doing early break
1476 : : vectorization, LOOP_VINFO needs to be provided and is used to keep track of
1477 : : memory references that need to be updated should we decide to vectorize. */
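 : :
 : : /* A rough sketch of the common vectorizer use, peeling the copy to the
 : : exit edge E (an illustration only, block names are not from the
 : : sources):
 : :
 : : preheader preheader
 : : | |
 : : LOOP ==> LOOP (vectorized)
 : : | |
 : : exit new_preheader
 : : |
 : : NEW_LOOP (epilogue)
 : : |
 : : exit */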
1478 : :
1479 : : class loop *
1480 : 34486 : slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit,
1481 : : class loop *scalar_loop,
1482 : : edge scalar_exit, edge e, edge *new_e,
1483 : : bool flow_loops,
1484 : : vec<basic_block> *updated_doms)
1485 : : {
1486 : 34486 : class loop *new_loop;
1487 : 34486 : basic_block *new_bbs, *bbs, *pbbs;
1488 : 34486 : bool at_exit;
1489 : 34486 : bool was_imm_dom;
1490 : 34486 : basic_block exit_dest;
1491 : 34486 : edge exit, new_exit;
1492 : 34486 : bool duplicate_outer_loop = false;
1493 : :
1494 : 34486 : exit = loop_exit;
1495 : 34486 : at_exit = (e == exit);
1496 : 34486 : if (!at_exit && e != loop_preheader_edge (loop))
1497 : : return NULL;
1498 : :
1499 : 34486 : if (scalar_loop == NULL)
1500 : : {
1501 : 31972 : scalar_loop = loop;
1502 : 31972 : scalar_exit = loop_exit;
1503 : : }
1504 : 2514 : else if (scalar_loop == loop)
1505 : 0 : scalar_exit = loop_exit;
1506 : : else
1507 : : {
1508 : : /* Loop has been versioned, match exits up using the aux index. */
1509 : 7542 : for (edge exit : get_loop_exit_edges (scalar_loop))
1510 : 2514 : if (exit->aux == loop_exit->aux)
1511 : : {
1512 : 2514 : scalar_exit = exit;
1513 : 2514 : break;
1514 : 2514 : }
1515 : :
1516 : 2514 : gcc_assert (scalar_exit);
1517 : : }
1518 : :
1519 : 34486 : bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1520 : 34486 : pbbs = bbs + 1;
1521 : 34486 : get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1522 : : /* Allow duplication of outer loops. */
1523 : 34486 : if (scalar_loop->inner)
1524 : 123 : duplicate_outer_loop = true;
1525 : :
1526 : : /* Generate new loop structure. */
1527 : 34486 : new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1528 : 34486 : duplicate_subloops (scalar_loop, new_loop);
1529 : :
1530 : 34486 : exit_dest = exit->dest;
1531 : 34486 : was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1532 : 34486 : exit_dest) == exit->src ?
1533 : : true : false);
1534 : :
1535 : : /* Also copy the pre-header, this avoids jumping through hoops to
1536 : : duplicate the loop entry PHI arguments. Create an empty
1537 : : pre-header unconditionally for this. */
1538 : 34486 : basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1539 : 34486 : edge entry_e = single_pred_edge (preheader);
1540 : 34486 : bbs[0] = preheader;
1541 : 34486 : new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1542 : :
1543 : 34486 : copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1544 : : &scalar_exit, 1, &new_exit, NULL,
1545 : : at_exit ? loop->latch : e->src, true);
1546 : 34486 : exit = loop_exit;
1547 : 34486 : basic_block new_preheader = new_bbs[0];
1548 : :
1549 : 34486 : gcc_assert (new_exit);
1550 : :
1551 : : /* Record the new loop exit information. new_loop doesn't have SCEV data and
1552 : : so we must initialize the exit information. */
1553 : 34486 : if (new_e)
1554 : 33415 : *new_e = new_exit;
1555 : :
1556 : : /* Before installing PHI arguments make sure that the edges
1557 : : into them match that of the scalar loop we analyzed. This
1558 : : makes sure the SLP tree matches up between the main vectorized
1559 : : loop and the epilogue vectorized copies. */
1560 : 34486 : if (single_succ_edge (preheader)->dest_idx
1561 : 34486 : != single_succ_edge (new_bbs[0])->dest_idx)
1562 : : {
1563 : 29546 : basic_block swap_bb = new_bbs[1];
1564 : 29546 : gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1565 : 29546 : std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1566 : 29546 : EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1567 : 29546 : EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1568 : : }
1569 : 34486 : if (duplicate_outer_loop)
1570 : : {
1571 : 123 : class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
1572 : 123 : if (loop_preheader_edge (scalar_loop)->dest_idx
1573 : 123 : != loop_preheader_edge (new_inner_loop)->dest_idx)
1574 : : {
1575 : 98 : basic_block swap_bb = new_inner_loop->header;
1576 : 98 : gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
1577 : 98 : std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
1578 : 98 : EDGE_PRED (swap_bb, 0)->dest_idx = 0;
1579 : 98 : EDGE_PRED (swap_bb, 1)->dest_idx = 1;
1580 : : }
1581 : : }
1582 : :
1583 : 34486 : add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1584 : :
1585 : : /* Skip the new preheader since it's deleted if the loop copy is added at entry. */
1586 : 180134 : for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1587 : 111162 : rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1588 : :
1589 : : /* Rename the exit uses. */
1590 : 139864 : for (edge exit : get_loop_exit_edges (new_loop))
1591 : 36406 : for (auto gsi = gsi_start_phis (exit->dest);
1592 : 84394 : !gsi_end_p (gsi); gsi_next (&gsi))
1593 : : {
1594 : 47988 : tree orig_def = PHI_ARG_DEF_FROM_EDGE (gsi.phi (), exit);
1595 : 47988 : rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), exit));
1596 : 47988 : if (MAY_HAVE_DEBUG_BIND_STMTS)
1597 : 23066 : adjust_debug_stmts (orig_def, PHI_RESULT (gsi.phi ()), exit->dest);
1598 : 34486 : }
1599 : :
1600 : 34486 : auto loop_exits = get_loop_exit_edges (loop);
1601 : 34486 : bool multiple_exits_p = loop_exits.length () > 1;
1602 : 34486 : auto_vec<basic_block> doms;
1603 : :
1604 : 34486 : if (at_exit) /* Add the loop copy at exit. */
1605 : : {
1606 : 33009 : if (scalar_loop != loop && new_exit->dest != exit_dest)
1607 : : {
1608 : 2510 : new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1609 : 2510 : flush_pending_stmts (new_exit);
1610 : : }
1611 : :
1612 : 33009 : bool need_virtual_phi = get_virtual_phi (loop->header);
1613 : :
1614 : : /* For the main loop exit preserve the LC PHI nodes. For vectorization
1615 : : we need them to continue or finalize reductions. Since we do not
1616 : : copy the loop exit blocks we have to materialize PHIs at the
1617 : : new destination before redirecting edges. */
1618 : 33009 : for (auto gsi_from = gsi_start_phis (loop_exit->dest);
1619 : 78100 : !gsi_end_p (gsi_from); gsi_next (&gsi_from))
1620 : : {
1621 : 45091 : tree res = gimple_phi_result (*gsi_from);
1622 : 45091 : create_phi_node (copy_ssa_name (res), new_preheader);
1623 : : }
1624 : 33009 : edge e = redirect_edge_and_branch (loop_exit, new_preheader);
1625 : 33009 : gcc_assert (e == loop_exit);
1626 : 33009 : flush_pending_stmts (loop_exit);
1627 : 33009 : set_immediate_dominator (CDI_DOMINATORS, new_preheader, loop_exit->src);
1628 : :
1629 : : /* If we ended up choosing an exit leading to a path not using memory
1630 : : we can end up without a virtual LC PHI. Create it when it is
1631 : : needed because of the epilog loop continuation. */
1632 : 33009 : if (need_virtual_phi && !get_virtual_phi (loop_exit->dest))
1633 : : {
1634 : 8 : tree header_def = gimple_phi_result (get_virtual_phi (loop->header));
1635 : 8 : gphi *vphi = create_phi_node (copy_ssa_name (header_def),
1636 : : new_preheader);
1637 : 8 : add_phi_arg (vphi, get_live_virtual_operand_on_edge (loop_exit),
1638 : : loop_exit, UNKNOWN_LOCATION);
1639 : : }
1640 : :
1641 : 33009 : bool multiple_exits_p = loop_exits.length () > 1;
1642 : 66018 : basic_block main_loop_exit_block = new_preheader;
1643 : 33009 : basic_block alt_loop_exit_block = NULL;
1644 : : /* Create the CFG for multiple exits.
1645 : : | loop_exit | alt1 | altN
1646 : : v v ... v
1647 : : main_loop_exit_block: alt_loop_exit_block:
1648 : : | /
1649 : : v v
1650 : : new_preheader:
1651 : : where in the new preheader we need merge PHIs for
1652 : : the continuation values into the epilogue header.
1653 : : Do not bother with exit PHIs for the early exits except for
1654 : : their live virtual operand. We'll fix up things below. */
1655 : 33009 : if (multiple_exits_p)
1656 : : {
1657 : 1482 : edge loop_e = single_succ_edge (new_preheader);
1658 : 1482 : new_preheader = split_edge (loop_e);
1659 : :
1660 : 1482 : gphi *vphi = NULL;
1661 : 1482 : alt_loop_exit_block = new_preheader;
1662 : 7615 : for (auto exit : loop_exits)
1663 : 3169 : if (exit != loop_exit)
1664 : : {
1665 : 1687 : tree vphi_def = NULL_TREE;
1666 : 1687 : if (gphi *evphi = get_virtual_phi (exit->dest))
1667 : 623 : vphi_def = gimple_phi_arg_def_from_edge (evphi, exit);
1668 : 1687 : edge res = redirect_edge_and_branch (exit, alt_loop_exit_block);
1669 : 1687 : gcc_assert (res == exit);
1670 : 1687 : redirect_edge_var_map_clear (exit);
1671 : 1687 : if (alt_loop_exit_block == new_preheader)
1672 : 1482 : alt_loop_exit_block = split_edge (exit);
1673 : 1687 : if (!need_virtual_phi)
1674 : 1241 : continue;
1675 : : /* When the edge has no virtual LC PHI get at the live
1676 : : virtual operand by other means. */
1677 : 446 : if (!vphi_def)
1678 : 2 : vphi_def = get_live_virtual_operand_on_edge (exit);
1679 : 446 : if (!vphi)
1680 : 424 : vphi = create_phi_node (copy_ssa_name (vphi_def),
1681 : : alt_loop_exit_block);
1682 : : else
1683 : : /* Edge redirection might re-allocate the PHI node
1684 : : so we have to rediscover it. */
1685 : 22 : vphi = get_virtual_phi (alt_loop_exit_block);
1686 : 446 : add_phi_arg (vphi, vphi_def, exit, UNKNOWN_LOCATION);
1687 : : }
1688 : :
1689 : 1482 : set_immediate_dominator (CDI_DOMINATORS, new_preheader,
1690 : : loop->header);
1691 : :
1692 : : /* Fix up the profile counts of the new exit blocks.
1693 : : main_loop_exit_block was created by duplicating the
1694 : : preheader, so needs its count scaling according to the main
1695 : : exit edge's probability. The remaining count from the
1696 : : preheader goes to the alt_loop_exit_block, since all
1697 : : alternative exits have been redirected there. */
1698 : 1482 : main_loop_exit_block->count = loop_exit->count ();
1699 : 1482 : alt_loop_exit_block->count
1700 : 1482 : = preheader->count - main_loop_exit_block->count;
1701 : : }
1702 : :
1703 : : /* Adjust the epilog loop PHI entry values to continue iteration.
1704 : : This adds remaining necessary LC PHI nodes to the main exit
1705 : : and creates merge PHIs when we have multiple exits with
1706 : : their appropriate continuation. */
1707 : 33009 : if (flow_loops)
1708 : : {
1709 : 33009 : edge loop_entry = single_succ_edge (new_preheader);
1710 : 33009 : bool peeled_iters = single_pred (loop->latch) != loop_exit->src;
1711 : :
1712 : : /* Record the new SSA names in the cache so that we can skip
1713 : : materializing them again when we fill in the rest of the LC SSA
1714 : : variables. */
1715 : 33009 : hash_map <tree, tree> new_phi_args;
1716 : 33009 : for (auto psi = gsi_start_phis (main_loop_exit_block);
1717 : 78108 : !gsi_end_p (psi); gsi_next (&psi))
1718 : : {
1719 : 45099 : gphi *phi = *psi;
1720 : 45099 : tree new_arg = gimple_phi_arg_def_from_edge (phi, loop_exit);
1721 : 45099 : if (TREE_CODE (new_arg) != SSA_NAME)
1722 : 377 : continue;
1723 : :
1724 : : /* If the loop doesn't have a virtual def then the epilog LC PHI for
1725 : : it isn't needed either; remove it and avoid creating new defs. */
1726 : 44813 : if (virtual_operand_p (new_arg) && !need_virtual_phi)
1727 : : {
1728 : 91 : auto gsi = gsi_for_stmt (phi);
1729 : 91 : remove_phi_node (&gsi, true);
1730 : 91 : continue;
1731 : 91 : }
1732 : :
1733 : : /* If we decided not to remove the PHI node we should also not
1734 : : rematerialize it later on. */
1735 : 44722 : new_phi_args.put (new_arg, gimple_phi_result (phi));
1736 : : }
1737 : :
1738 : : /* Create the merge PHI nodes in new_preheader and populate the
1739 : : arguments for the exits. */
1740 : 33009 : if (multiple_exits_p)
1741 : : {
1742 : 1482 : for (auto gsi_from = gsi_start_phis (loop->header),
1743 : 1482 : gsi_to = gsi_start_phis (new_loop->header);
1744 : 4699 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1745 : 3217 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1746 : : {
1747 : 3217 : gimple *from_phi = gsi_stmt (gsi_from);
1748 : 3217 : gimple *to_phi = gsi_stmt (gsi_to);
1749 : :
1750 : : /* When the vector loop is peeled we need to use the
1751 : : value at the start of the loop, otherwise the main loop exit
1752 : : should use the final iter value. */
1753 : 3217 : tree new_arg;
1754 : 3217 : if (peeled_iters)
1755 : 55 : new_arg = gimple_phi_result (from_phi);
1756 : : else
1757 : 3162 : new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1758 : : loop_latch_edge (loop));
1759 : :
1760 : : /* Check if we've already created a new phi node during edge
1761 : : redirection and re-use it if so. Otherwise create a
1762 : : LC PHI node to feed the merge PHI. */
1763 : 3217 : tree *res;
1764 : 6434 : if (virtual_operand_p (new_arg))
1765 : : {
1766 : : /* Use the existing virtual LC SSA from exit block. */
1767 : 424 : gphi *vphi = get_virtual_phi (main_loop_exit_block);
1768 : 424 : new_arg = gimple_phi_result (vphi);
1769 : : }
1770 : 2793 : else if ((res = new_phi_args.get (new_arg)))
1771 : 104 : new_arg = *res;
1772 : : else
1773 : : {
1774 : : /* Create the LC PHI node for the exit. */
1775 : 2689 : tree new_def = copy_ssa_name (new_arg);
1776 : 2689 : gphi *lc_phi
1777 : 2689 : = create_phi_node (new_def, main_loop_exit_block);
1778 : 2689 : SET_PHI_ARG_DEF (lc_phi, 0, new_arg);
1779 : 2689 : new_arg = new_def;
1780 : : }
1781 : :
1782 : : /* Create the PHI node in the merge block merging the
1783 : : main and early exit values. */
1784 : 3217 : tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1785 : 3217 : gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1786 : 3217 : edge main_e = single_succ_edge (main_loop_exit_block);
1787 : 3217 : SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, main_e, new_arg);
1788 : :
1789 : : /* And adjust the epilog entry value. */
1790 : 3217 : adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1791 : : }
1792 : :
1793 : : /* After creating the merge PHIs handle the early exits; those
1794 : : should use the values at the start of the loop. */
1795 : 1482 : for (auto gsi_from = gsi_start_phis (loop->header),
1796 : 1482 : gsi_to = gsi_start_phis (new_preheader);
1797 : 4699 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1798 : 3217 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1799 : : {
1800 : 3217 : gimple *from_phi = gsi_stmt (gsi_from);
1801 : 3217 : gimple *to_phi = gsi_stmt (gsi_to);
1802 : :
1803 : : /* Now update the virtual PHI nodes with the right value. */
1804 : 3217 : tree alt_arg = gimple_phi_result (from_phi);
1805 : 6434 : if (virtual_operand_p (alt_arg))
1806 : : {
1807 : 424 : gphi *vphi = get_virtual_phi (alt_loop_exit_block);
1808 : 424 : alt_arg = gimple_phi_result (vphi);
1809 : : }
1810 : : /* For other live args we didn't create LC PHI nodes.
1811 : : Do so here. */
1812 : : else
1813 : : {
1814 : 2793 : tree alt_def = copy_ssa_name (alt_arg);
1815 : 2793 : gphi *lc_phi
1816 : 2793 : = create_phi_node (alt_def, alt_loop_exit_block);
1817 : 5985 : for (unsigned i = 0; i < gimple_phi_num_args (lc_phi);
1818 : : ++i)
1819 : 3192 : SET_PHI_ARG_DEF (lc_phi, i, alt_arg);
1820 : : alt_arg = alt_def;
1821 : : }
1822 : 3217 : edge alt_e = single_succ_edge (alt_loop_exit_block);
1823 : 3217 : SET_PHI_ARG_DEF_ON_EDGE (to_phi, alt_e, alt_arg);
1824 : : }
1825 : : }
1826 : : /* For the single exit case only create the missing LC PHI nodes
1827 : : for the continuation of the loop IVs that are not also already
1828 : : reductions and thus had LC PHI nodes on the exit already. */
1829 : : else
1830 : : {
1831 : 31527 : for (auto gsi_from = gsi_start_phis (loop->header),
1832 : 31527 : gsi_to = gsi_start_phis (new_loop->header);
1833 : 120608 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1834 : 89081 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1835 : : {
1836 : 89081 : gimple *from_phi = gsi_stmt (gsi_from);
1837 : 89081 : gimple *to_phi = gsi_stmt (gsi_to);
1838 : 89081 : tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1839 : : loop_latch_edge (loop));
1840 : :
1841 : : /* Check if we've already created a new phi node during edge
1842 : : redirection. If we have, only propagate the value
1843 : : downwards. */
1844 : 89081 : if (tree *res = new_phi_args.get (new_arg))
1845 : : {
1846 : 42074 : adjust_phi_and_debug_stmts (to_phi, loop_entry, *res);
1847 : 42074 : continue;
1848 : : }
1849 : :
1850 : 47007 : tree new_res = copy_ssa_name (gimple_phi_result (from_phi));
1851 : 47007 : gphi *lcssa_phi = create_phi_node (new_res, new_preheader);
1852 : 47007 : SET_PHI_ARG_DEF_ON_EDGE (lcssa_phi, loop_exit, new_arg);
1853 : 47007 : adjust_phi_and_debug_stmts (to_phi, loop_entry, new_res);
1854 : : }
1855 : : }
1856 : 33009 : }
1857 : :
1858 : 33009 : if (was_imm_dom || duplicate_outer_loop)
1859 : 32693 : set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1860 : :
1861 : : /* And remove the unnecessary forwarder again. Keep the other
1862 : : one so we have a proper pre-header for the loop at the exit edge. */
1863 : 33009 : redirect_edge_pred (single_succ_edge (preheader),
1864 : : single_pred (preheader));
1865 : 33009 : delete_basic_block (preheader);
1866 : 33009 : set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1867 : 33009 : loop_preheader_edge (scalar_loop)->src);
1868 : :
1869 : : /* Finally after wiring the new epilogue we need to update its main exit
1870 : : to the original function exit we recorded. Other exits are already
1871 : : correct. */
1872 : 33009 : if (multiple_exits_p)
1873 : : {
1874 : 1482 : class loop *update_loop = new_loop;
1875 : 1482 : doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
1876 : 78325 : for (unsigned i = 0; i < doms.length (); ++i)
1877 : 76843 : if (flow_bb_inside_loop_p (loop, doms[i]))
1878 : 4637 : doms.unordered_remove (i);
1879 : :
1880 : 7615 : for (edge e : get_loop_exit_edges (update_loop))
1881 : : {
1882 : 3169 : edge ex;
1883 : 3169 : edge_iterator ei;
1884 : 6245 : FOR_EACH_EDGE (ex, ei, e->dest->succs)
1885 : : {
1886 : : /* Find the first non-fallthrough block as fall-throughs can't
1887 : : dominate other blocks. */
1888 : 3076 : if (single_succ_p (ex->dest))
1889 : : {
1890 : 1520 : doms.safe_push (ex->dest);
1891 : 1520 : ex = single_succ_edge (ex->dest);
1892 : : }
1893 : 3076 : doms.safe_push (ex->dest);
1894 : : }
1895 : 3169 : doms.safe_push (e->dest);
1896 : 1482 : }
1897 : :
1898 : 1482 : iterate_fix_dominators (CDI_DOMINATORS, doms, false);
1899 : 1482 : if (updated_doms)
1900 : 1482 : updated_doms->safe_splice (doms);
1901 : : }
1902 : : }
1903 : : else /* Add the copy at entry. */
1904 : : {
1905 : : /* Copy the current loop LC PHI nodes between the original loop exit
1906 : : block and the new loop header. This allows us to later split the
1907 : : preheader block and still find the right LC nodes. */
1908 : 1477 : if (flow_loops)
1909 : 406 : for (auto gsi_from = gsi_start_phis (new_loop->header),
1910 : 406 : gsi_to = gsi_start_phis (loop->header);
1911 : 1260 : !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);
1912 : 854 : gsi_next (&gsi_from), gsi_next (&gsi_to))
1913 : : {
1914 : 854 : gimple *from_phi = gsi_stmt (gsi_from);
1915 : 854 : gimple *to_phi = gsi_stmt (gsi_to);
1916 : 854 : tree new_arg = PHI_ARG_DEF_FROM_EDGE (from_phi,
1917 : : loop_latch_edge (new_loop));
1918 : 854 : adjust_phi_and_debug_stmts (to_phi, loop_preheader_edge (loop),
1919 : : new_arg);
1920 : : }
1921 : :
1922 : 1477 : if (scalar_loop != loop)
1923 : : {
1924 : : /* Remove the unnecessary forwarder of scalar_loop again. */
1925 : 4 : redirect_edge_pred (single_succ_edge (preheader),
1926 : : single_pred (preheader));
1927 : 4 : delete_basic_block (preheader);
1928 : 4 : set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1929 : 4 : loop_preheader_edge (scalar_loop)->src);
1930 : 4 : preheader = split_edge (loop_preheader_edge (loop));
1931 : 4 : entry_e = single_pred_edge (preheader);
1932 : : }
1933 : :
1934 : 1477 : redirect_edge_and_branch_force (entry_e, new_preheader);
1935 : 1477 : flush_pending_stmts (entry_e);
1936 : 1477 : set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1937 : :
1938 : 1477 : redirect_edge_and_branch_force (new_exit, preheader);
1939 : 1477 : flush_pending_stmts (new_exit);
1940 : 1477 : set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1941 : :
1942 : : /* And remove the unnecessary forwarder again. Keep the other
1943 : : one so we have a proper pre-header for the loop at the exit edge. */
1944 : 1477 : redirect_edge_pred (single_succ_edge (new_preheader),
1945 : : single_pred (new_preheader));
1946 : 1477 : delete_basic_block (new_preheader);
1947 : 1477 : set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1948 : 1477 : loop_preheader_edge (new_loop)->src);
1949 : :
1950 : : /* Update dominators for multiple exits. */
1951 : 1477 : if (multiple_exits_p)
1952 : : {
1953 : 1089 : for (edge alt_e : loop_exits)
1954 : : {
1955 : 447 : if (alt_e == loop_exit)
1956 : 214 : continue;
1957 : 233 : basic_block old_dom
1958 : 233 : = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
1959 : 233 : if (flow_bb_inside_loop_p (loop, old_dom))
1960 : : {
1961 : 68 : auto_vec<basic_block, 8> queue;
1962 : 68 : for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
1963 : 219 : son; son = next_dom_son (CDI_DOMINATORS, son))
1964 : 151 : if (!flow_bb_inside_loop_p (loop, son))
1965 : 83 : queue.safe_push (son);
1966 : 287 : for (auto son : queue)
1967 : 83 : set_immediate_dominator (CDI_DOMINATORS,
1968 : : son, get_bb_copy (old_dom));
1969 : 68 : }
1970 : : }
1971 : : }
1972 : : }
1973 : :
1974 : 34486 : free (new_bbs);
1975 : 34486 : free (bbs);
1976 : :
1977 : 34486 : checking_verify_dominators (CDI_DOMINATORS);
1978 : :
1979 : 34486 : return new_loop;
1980 : 34486 : }
1981 : :
1982 : :
1983 : : /* Given the condition expression COND, put it as the last statement of
1984 : : GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1985 : : DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1986 : : skip the loop. PROBABILITY is the skip edge's probability. Mark the
1987 : : new edge as irreducible if IRREDUCIBLE_P is true. */
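 : :
 : : /* Illustration (not from the sources): after the call GUARD_BB ends in
 : : if (COND), the new true edge skips to GUARD_TO with probability
 : : PROBABILITY and the pre-existing successor edge becomes the false
 : : edge with the inverted probability:
 : :
 : : guard_bb: if (cond) --true--> guard_to (skip)
 : : |
 : : false
 : : v
 : : enter_e->dest (enter the loop) */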
1988 : :
1989 : : static edge
1990 : 50480 : slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1991 : : basic_block guard_to, basic_block dom_bb,
1992 : : profile_probability probability, bool irreducible_p)
1993 : : {
1994 : 50480 : gimple_stmt_iterator gsi;
1995 : 50480 : edge new_e, enter_e;
1996 : 50480 : gcond *cond_stmt;
1997 : 50480 : gimple_seq gimplify_stmt_list = NULL;
1998 : :
1999 : 50480 : enter_e = EDGE_SUCC (guard_bb, 0);
2000 : 50480 : enter_e->flags &= ~EDGE_FALLTHRU;
2001 : 50480 : enter_e->flags |= EDGE_FALSE_VALUE;
2002 : 50480 : gsi = gsi_last_bb (guard_bb);
2003 : :
2004 : 50480 : cond = force_gimple_operand_1 (cond, &gimplify_stmt_list,
2005 : : is_gimple_condexpr_for_cond, NULL_TREE);
2006 : 50480 : if (gimplify_stmt_list)
2007 : 22771 : gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
2008 : :
2009 : 50480 : cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
2010 : 50480 : gsi = gsi_last_bb (guard_bb);
2011 : 50480 : gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
2012 : :
2013 : : /* Add new edge to connect guard block to the merge/loop-exit block. */
2014 : 50480 : new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
2015 : :
2016 : 50480 : new_e->probability = probability;
2017 : 50480 : if (irreducible_p)
2018 : 14 : new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
2019 : :
2020 : 50480 : enter_e->probability = probability.invert ();
2021 : 50480 : set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
2022 : :
2023 : : /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
2024 : 50480 : if (enter_e->dest->loop_father->header == enter_e->dest)
2025 : 409 : split_edge (enter_e);
2026 : :
2027 : 50480 : return new_e;
2028 : : }
2029 : :
2030 : :
2031 : : /* This function verifies that the following restrictions apply to LOOP:
2032 : : (1) it consists of exactly 2 basic blocks (a header and an empty latch)
2033 : : for an innermost loop, and of 5 basic blocks for an outer loop.
2034 : : (2) it is single entry, single exit
2035 : : (3) its exit condition is the last stmt in the header
2036 : : (4) E is the entry/exit edge of LOOP.
2037 : : */
2038 : :
2039 : : bool
2040 : 410810 : slpeel_can_duplicate_loop_p (const class loop *loop, const_edge exit_e,
2041 : : const_edge e)
2042 : : {
2043 : 410810 : edge entry_e = loop_preheader_edge (loop);
2044 : 410810 : gcond *orig_cond = get_loop_exit_condition (exit_e);
2045 : 410810 : gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
2046 : :
2047 : : /* All loops have an outer scope; the only case loop->outer is NULL is for
2048 : : the function itself. */
2049 : 410810 : if (!loop_outer (loop)
2050 : 410810 : || !empty_block_p (loop->latch)
2051 : : || !exit_e
2052 : : /* Verify that new loop exit condition can be trivially modified. */
2053 : 410810 : || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
2054 : 821620 : || (e != exit_e && e != entry_e))
2055 : 0 : return false;
2056 : :
2057 : 410810 : basic_block *bbs = XNEWVEC (basic_block, loop->num_nodes);
2058 : 410810 : get_loop_body_with_size (loop, bbs, loop->num_nodes);
2059 : 410810 : bool ret = can_copy_bbs_p (bbs, loop->num_nodes);
2060 : 410810 : free (bbs);
2061 : 410810 : return ret;
2062 : : }
2063 : :
2064 : : /* Function find_loop_location.
2065 : :
2066 : : Extract the location of the loop in the source code.
2067 : : If the loop is not well formed for vectorization, an estimated
2068 : : location is calculated.
2069 : : Return the loop location on success and NULL otherwise. */
2070 : :
2071 : : dump_user_location_t
2072 : 3617342 : find_loop_location (class loop *loop)
2073 : : {
2074 : 3617342 : gimple *stmt = NULL;
2075 : 3617342 : basic_block bb;
2076 : 3617342 : gimple_stmt_iterator si;
2077 : :
2078 : 3617342 : if (!loop)
2079 : 0 : return dump_user_location_t ();
2080 : :
2081 : : /* For the root of the loop tree return the function location. */
2082 : 3617342 : if (!loop_outer (loop))
2083 : 0 : return dump_user_location_t::from_function_decl (cfun->decl);
2084 : :
2085 : 3617342 : if (loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
2086 : : {
2087 : : /* We only care about the loop location, so use any exit with location
2088 : : information. */
2089 : 11370945 : for (edge e : get_loop_exit_edges (loop))
2090 : : {
2091 : 3735045 : stmt = get_loop_exit_condition (e);
2092 : :
2093 : 3735045 : if (stmt
2094 : 3735045 : && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2095 : 3157442 : return stmt;
2096 : 3617342 : }
2097 : : }
2098 : :
2099 : : /* If we got here the loop is probably not "well formed",
2100 : : try to estimate the loop location. */
2101 : :
2102 : 459900 : if (!loop->header)
2103 : 0 : return dump_user_location_t ();
2104 : :
2105 : 459900 : bb = loop->header;
2106 : :
2107 : 1613194 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2108 : : {
2109 : 1023984 : stmt = gsi_stmt (si);
2110 : 1023984 : if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
2111 : 330590 : return stmt;
2112 : : }
2113 : :
2114 : 129310 : return dump_user_location_t ();
2115 : : }
2116 : :
2117 : : /* Return true if the phi described by STMT_INFO defines an IV of the
2118 : : loop to be vectorized. */
2119 : :
2120 : : static bool
2121 : 1113236 : iv_phi_p (stmt_vec_info stmt_info)
2122 : : {
2123 : 1113236 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
2124 : 2226472 : if (virtual_operand_p (PHI_RESULT (phi)))
2125 : : return false;
2126 : :
2127 : 885268 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2128 : 885268 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2129 : 114688 : return false;
2130 : :
2131 : : return true;
2132 : : }
2133 : :
2134 : : /* Return true if vectorizer can peel for nonlinear iv. */
2135 : : static bool
2136 : 9854 : vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
2137 : : stmt_vec_info stmt_info)
2138 : : {
2139 : 9854 : enum vect_induction_op_type induction_type
2140 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
2141 : 9854 : tree niters_skip;
2142 : : /* Init_expr will be updated by vect_update_ivs_after_vectorizer
2143 : : if niters or vf is unknown:
2144 : : For shift, when the shift amount >= precision, there would be UB.
2145 : : For mult, we don't know how to generate
2146 : : init_expr * pow (step, niters) for variable niters.
2147 : : For neg, unknown niters are OK, since the niters of the vectorized
2148 : : main loop will always be a multiple of 2.
2149 : : See also PR113163, PR114196 and PR114485. */
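 : :
 : : /* For instance (an illustrative case, not from the sources): for a mult
 : : IV with init i0 and step 3, advancing past NITERS iterations needs
 : : i0 * 3^NITERS; with variable NITERS there is no cheap way to
 : : materialize that power, hence the checks below bail out. */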
2150 : 9854 : if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
2151 : 9854 : || LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2152 : 9854 : || (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2153 : 6684 : && induction_type != vect_step_op_neg))
2154 : : {
2155 : 6522 : if (dump_enabled_p ())
2156 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2157 : : "Peeling for epilogue is not supported"
2158 : : " for this nonlinear induction"
2159 : : " when iteration count is unknown or"
2160 : : " when using partial vectorization.\n");
2161 : 6522 : return false;
2162 : : }
2163 : :
2164 : 3332 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
2165 : 238 : && induction_type == vect_step_op_mul)
2166 : : {
2167 : 24 : if (dump_enabled_p ())
2168 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2169 : : "Peeling is not supported for nonlinear mult"
2170 : : " induction using partial vectorization.\n");
2171 : 24 : return false;
2172 : : }
2173 : :
2174 : : /* Avoid compile time hog on vect_peel_nonlinear_iv_init. */
2175 : 3094 : if (induction_type == vect_step_op_mul)
2176 : : {
2177 : 401 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
2178 : 401 : tree type = TREE_TYPE (step_expr);
2179 : :
2180 : 796 : if (wi::exact_log2 (wi::to_wide (step_expr)) == -1
2181 : 401 : && LOOP_VINFO_INT_NITERS (loop_vinfo) >= TYPE_PRECISION (type))
2182 : : {
2183 : 6 : if (dump_enabled_p ())
2184 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2185 : : "Avoid compile time hog on"
2186 : : " vect_peel_nonlinear_iv_init"
2187 : : " for nonlinear induction vec_step_op_mul"
2188 : : " when iteration count is too big.\n");
2189 : 6 : return false;
2190 : : }
2191 : : }
2192 : :
2193 : : /* Also doesn't support peeling for neg when niter is variable.
2194 : : ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
2195 : 3302 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
2196 : 3302 : if ((niters_skip != NULL_TREE
2197 : 0 : && (TREE_CODE (niters_skip) != INTEGER_CST
2198 : 0 : || (HOST_WIDE_INT) TREE_INT_CST_LOW (niters_skip) < 0))
2199 : 3302 : || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
2200 : 3302 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
2201 : : {
2202 : 4 : if (dump_enabled_p ())
2203 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204 : : "Peeling for alignment is not supported"
2205 : : " for nonlinear induction when niters_skip"
2206 : : " is not constant.\n");
2207 : 4 : return false;
2208 : : }
2209 : :
2210 : : return true;
2211 : : }
2212 : :
2213 : : /* Function vect_can_advance_ivs_p
2214 : :
2215 : : In case the number of iterations that LOOP executes is unknown at compile
2216 : : time, an epilog loop will be generated, and the loop induction variables
2217 : : (IVs) will be "advanced" to the value they are supposed to take just before
2218 : : the epilog loop. Here we check that the access function of the loop IVs
2219 : : and the expression that represents the loop bound are simple enough.
2220 : : These restrictions will be relaxed in the future. */
2221 : :
2222 : : bool
2223 : 417963 : vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
2224 : : {
2225 : 417963 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2226 : 417963 : basic_block bb = loop->header;
2227 : 417963 : gphi_iterator gsi;
2228 : :
2229 : : /* Analyze phi functions of the loop header. */
2230 : :
2231 : 417963 : if (dump_enabled_p ())
2232 : 23680 : dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
2233 : 1428125 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2234 : : {
2235 : 1017721 : tree evolution_part;
2236 : 1017721 : enum vect_induction_op_type induction_type;
2237 : :
2238 : 1017721 : gphi *phi = gsi.phi ();
2239 : 1017721 : stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2240 : 1017721 : if (dump_enabled_p ())
2241 : 68103 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
2242 : : phi_info->stmt);
2243 : :
2244 : : /* Skip virtual phi's. The data dependences that are associated with
2245 : : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
2246 : :
2247 : : Skip reduction phis. */
2248 : 1017721 : if (!iv_phi_p (phi_info))
2249 : : {
2250 : 300023 : if (dump_enabled_p ())
2251 : 24056 : dump_printf_loc (MSG_NOTE, vect_location,
2252 : : "reduc or virtual phi. skip.\n");
2253 : 300023 : continue;
2254 : : }
2255 : :
2256 : 717698 : induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2257 : 717698 : if (induction_type != vect_step_op_add)
2258 : : {
2259 : 9854 : if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info))
2260 : : return false;
2261 : :
2262 : 3298 : continue;
2263 : : }
2264 : :
2265 : : /* Analyze the evolution function. */
2266 : :
2267 : 707844 : evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2268 : 707844 : if (evolution_part == NULL_TREE)
2269 : : {
2270 : 971 : if (dump_enabled_p ())
2271 : 78 : dump_printf (MSG_MISSED_OPTIMIZATION,
2272 : : "No access function or evolution.\n");
2273 : 971 : return false;
2274 : : }
2275 : :
2276 : : /* FORNOW: We do not transform initial conditions of IVs
2277 : : whose evolution functions are not invariant in the loop. */
2278 : :
2279 : 706873 : if (!expr_invariant_in_loop_p (loop, evolution_part))
2280 : : {
2281 : 32 : if (dump_enabled_p ())
2282 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2283 : : "evolution not invariant in loop.\n");
2284 : 32 : return false;
2285 : : }
2286 : :
2287 : : /* FORNOW: We do not transform initial conditions of IVs
2288 : : whose evolution functions are a polynomial of degree >= 2. */
2289 : :
2290 : 1717003 : if (tree_is_chrec (evolution_part))
2291 : : {
2292 : 0 : if (dump_enabled_p ())
2293 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2294 : : "evolution is chrec.\n");
2295 : 0 : return false;
2296 : : }
2297 : : }
2298 : :
2299 : : return true;
2300 : : }
2301 : :
2302 : :
2303 : : /* Function vect_update_ivs_after_vectorizer.
2304 : :
2305 : : "Advance" the induction variables of LOOP to the value they should take
2306 : : after the execution of LOOP. This is currently necessary because the
2307 : : vectorizer does not handle induction variables that are used after the
2308 : : loop. Such a situation occurs when the last iterations of LOOP are
2309 : : peeled, because:
2310 : : 1. We introduced new uses after LOOP for IVs that were not originally used
2311 : : after LOOP: the IVs of LOOP are now used by an epilog loop.
2312 : : 2. LOOP is going to be vectorized; this means that it will iterate N/VF
2313 : : times, whereas the loop IVs should be bumped N times.
2314 : :
2315 : : Input:
2316 : : - LOOP - a loop that is going to be vectorized. The last few iterations
2317 : : of LOOP were peeled.
2318 : : vectorized), i.e., the number of times the ivs should be bumped.
2319 : : vectorized). i.e, the number of times the ivs should be bumped.
2320 : : - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
2321 : : coming out from LOOP on which there are uses of the LOOP ivs
2322 : : (this is the path from LOOP->exit to epilog_loop->preheader).
2323 : :
2324 : : The new definitions of the ivs are placed in LOOP->exit.
2325 : : The phi args associated with the edge UPDATE_E in the bb
2326 : : UPDATE_E->dest are updated accordingly.
2327 : :
2328 : : - EARLY_EXIT_P - Indicates whether the exit is an early exit rather than
2329 : : the main latch exit.
2330 : :
2331 : : Assumption 1: Like the rest of the vectorizer, this function assumes
2332 : : a single loop exit that has a single predecessor.
2333 : :
2334 : : Assumption 2: The phi nodes in the LOOP header and in update_bb are
2335 : : organized in the same order.
2336 : :
2337 : : Assumption 3: The access function of the ivs is simple enough (see
2338 : : vect_can_advance_ivs_p). This assumption will be relaxed in the future.
2339 : :
2340 : : Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
2341 : : coming out of LOOP on which the ivs of LOOP are used (this is the path
2342 : : that leads to the epilog loop; other paths skip the epilog loop). This
2343 : : path starts with the edge UPDATE_E, and its destination (denoted update_bb)
2344 : : needs to have its phis updated.
2345 : : */
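 : :
 : : /* For example (illustrative): a linear IV with start INIT and step
 : : STEP is advanced to INIT + NITERS * STEP (the vect_step_op_add case
 : : below); the epilog loop then continues the IV from that value. */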
2346 : :
2347 : : static void
2348 : 34491 : vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
2349 : : tree niters, edge update_e,
2350 : : bool early_exit_p)
2351 : : {
2352 : 34491 : gphi_iterator gsi, gsi1;
2353 : 34491 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2354 : 34491 : basic_block update_bb = update_e->dest;
2355 : 34491 : basic_block exit_bb = update_e->src;
2356 : 34491 : gimple_stmt_iterator last_gsi = gsi_last_bb (exit_bb);
2357 : :
2358 : 34491 : for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
2359 : 130006 : !gsi_end_p (gsi) && !gsi_end_p (gsi1);
2360 : 95515 : gsi_next (&gsi), gsi_next (&gsi1))
2361 : : {
2362 : 95515 : tree init_expr;
2363 : 95515 : tree step_expr, off;
2364 : 95515 : tree type;
2365 : 95515 : tree var, ni, ni_name;
2366 : :
2367 : 95515 : gphi *phi = gsi.phi ();
2368 : 95515 : gphi *phi1 = gsi1.phi ();
2369 : 95515 : stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
2370 : 95515 : if (dump_enabled_p ())
2371 : 12026 : dump_printf_loc (MSG_NOTE, vect_location,
2372 : : "vect_update_ivs_after_vectorizer: phi: %G",
2373 : : (gimple *) phi);
2374 : :
2375 : : /* Skip reduction and virtual phis. */
2376 : 95515 : if (!iv_phi_p (phi_info))
2377 : : {
2378 : 42633 : if (dump_enabled_p ())
2379 : 4630 : dump_printf_loc (MSG_NOTE, vect_location,
2380 : : "reduc or virtual phi. skip.\n");
2381 : 42633 : continue;
2382 : : }
2383 : :
2384 : 52882 : type = TREE_TYPE (gimple_phi_result (phi));
2385 : 52882 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
2386 : 52882 : step_expr = unshare_expr (step_expr);
2387 : :
2388 : : /* FORNOW: We do not support IVs whose evolution function is a polynomial
2389 : : of degree >= 2 or exponential. */
2390 : 52882 : gcc_assert (!tree_is_chrec (step_expr));
2391 : :
2392 : 52882 : init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2393 : 52882 : gimple_seq stmts = NULL;
2394 : 52882 : enum vect_induction_op_type induction_type
2395 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
2396 : :
2397 : 52882 : if (induction_type == vect_step_op_add)
2398 : : {
2399 : 52754 : tree stype = TREE_TYPE (step_expr);
2400 : 52754 : off = fold_build2 (MULT_EXPR, stype,
2401 : : fold_convert (stype, niters), step_expr);
2402 : :
2403 : 52754 : if (POINTER_TYPE_P (type))
2404 : 3442 : ni = fold_build_pointer_plus (init_expr, off);
2405 : : else
2406 : 49312 : ni = fold_convert (type,
2407 : : fold_build2 (PLUS_EXPR, stype,
2408 : : fold_convert (stype, init_expr),
2409 : : off));
2410 : : }
2411 : : /* Don't bother calling vect_peel_nonlinear_iv_init. */
2412 : 128 : else if (induction_type == vect_step_op_neg)
2413 : : ni = init_expr;
2414 : : else
2415 : 84 : ni = vect_peel_nonlinear_iv_init (&stmts, init_expr,
2416 : : niters, step_expr,
2417 : : induction_type, early_exit_p);
2418 : :
2419 : 52882 : var = create_tmp_var (type, "tmp");
2420 : :
2421 : 52882 : gimple_seq new_stmts = NULL;
2422 : 52882 : ni_name = force_gimple_operand (ni, &new_stmts, false, var);
2423 : :
2424 : : /* Exit_bb shouldn't be empty, but we also can't insert after a control
2425 : : statement. */
2426 : 52882 : if (!gsi_end_p (last_gsi) && !is_ctrl_stmt (gsi_stmt (last_gsi)))
2427 : : {
2428 : 248 : gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT);
2429 : 248 : gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
2430 : : }
2431 : : else
2432 : : {
2433 : 52634 : gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT);
2434 : 52634 : gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
2435 : : }
2436 : :
2437 : : /* Fix phi expressions in all out-of-loop bbs. */
2438 : 52882 : imm_use_iterator imm_iter;
2439 : 52882 : gimple *use_stmt;
2440 : 52882 : use_operand_p use_p;
2441 : 52882 : tree ic_var = PHI_ARG_DEF_FROM_EDGE (phi1, update_e);
2442 : 52882 : if (TREE_CODE (ic_var) == SSA_NAME)
2443 : : {
2444 : 158908 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, ic_var)
2445 : 53152 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2446 : 159456 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2447 : 106030 : SET_USE (use_p, ni_name);
2448 : : }
2449 : : else
2450 : 4 : adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
2451 : : }
2452 : 34491 : }
2453 : :
2454 : : /* Return a gimple value containing the misalignment (measured in vector
2455 : : elements) for the loop described by LOOP_VINFO, i.e. how many elements
2456 : : it is away from a perfectly aligned address. Add any new statements
2457 : : to SEQ. */
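 : :
 : : /* E.g. (illustrative numbers): with a start address whose low bits are
 : : 0xc, a 16-byte target alignment and 4-byte elements,
 : : misalign_in_bytes = 0xc & 15 = 12 and
 : : misalign_in_elems = 12 >> log2 (4) = 3. */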
2458 : :
2459 : : static tree
2460 : 193 : get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
2461 : : {
2462 : 193 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2463 : 193 : stmt_vec_info stmt_info = dr_info->stmt;
2464 : 193 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2465 : :
2466 : 193 : poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2467 : 193 : unsigned HOST_WIDE_INT target_align_c;
2468 : 193 : tree target_align_minus_1;
2469 : :
2470 : 193 : bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2471 : 193 : size_zero_node) < 0;
2472 : 193 : tree offset = (negative
2473 : 193 : ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
2474 : : * TREE_INT_CST_LOW
2475 : : (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
2476 : 192 : : size_zero_node);
2477 : 193 : tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
2478 : : stmt_info, seq,
2479 : : offset);
2480 : 193 : tree type = unsigned_type_for (TREE_TYPE (start_addr));
2481 : 193 : if (target_align.is_constant (&target_align_c))
2482 : 193 : target_align_minus_1 = build_int_cst (type, target_align_c - 1);
2483 : : else
2484 : : {
2485 : : tree vla = build_int_cst (type, target_align);
2486 : : target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla,
2487 : : build_int_cst (type, 1));
2488 : : }
2489 : :
2490 : 193 : HOST_WIDE_INT elem_size
2491 : 193 : = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2492 : 386 : tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
2493 : :
2494 : : /* Create: misalign_in_bytes = addr & (target_align - 1). */
2495 : 193 : tree int_start_addr = fold_convert (type, start_addr);
2496 : 193 : tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
2497 : : target_align_minus_1);
2498 : :
2499 : : /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
2500 : 193 : tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
2501 : : elem_size_log);
2502 : :
2503 : 193 : return misalign_in_elems;
2504 : : }
2505 : :
2506 : : /* Function vect_gen_prolog_loop_niters
2507 : :
2508 : : Generate the number of iterations which should be peeled as prolog for the
2509 : : loop represented by LOOP_VINFO. It is calculated as the misalignment of
2510 : : DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
2511 : : As a result, after the execution of this loop, the data reference DR will
2512 : : refer to an aligned location. The following computation is generated:
2513 : :
2514 : : If the misalignment of DR is known at compile time:
2515 : : addr_mis = int mis = DR_MISALIGNMENT (dr);
2516 : : Else, compute address misalignment in bytes:
2517 : : addr_mis = addr & (target_align - 1)
2518 : :
2519 : : prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
2520 : :
2521 : : (elem_size = element type size; an element is the scalar element whose type
2522 : : is the inner type of the vectype)
2523 : :
2524 : : The computations will be emitted at the end of BB. We also compute and
2525 : : store upper bound (included) of the result in BOUND.
2526 : :
2527 : : When the step of the data-ref in the loop is not 1 (as in interleaved data
2528 : : and SLP), the number of iterations of the prolog must be divided by the step
2529 : : (which is equal to the size of interleaved group).
2530 : :
2531 : : The above formulas assume that VF == number of elements in the vector. This
2532 : : may not hold when there are multiple-types in the loop.
2533 : : In this case, for some data-references in the loop the VF does not represent
2534 : : the number of elements that fit in the vector. Therefore, instead of VF we
2535 : : use TYPE_VECTOR_SUBPARTS. */
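 : :
 : : /* A worked example (illustrative numbers): with 4-byte elements, a
 : : 32-byte target alignment (8 elements), a step of 1 and
 : : addr_mis = 3 elements, the formula above gives
 : : prolog_niters = (8 - 3) & (8 - 1) = 5, i.e. five peeled scalar
 : : iterations bring DR to a 32-byte boundary. */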
2536 : :
2537 : : static tree
2538 : 406 : vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
2539 : : basic_block bb, poly_int64 *bound)
2540 : : {
2541 : 406 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2542 : 406 : tree var;
2543 : 406 : tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2544 : 406 : gimple_seq stmts = NULL, new_stmts = NULL;
2545 : 406 : tree iters, iters_name;
2546 : 406 : stmt_vec_info stmt_info = dr_info->stmt;
2547 : 406 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2548 : 406 : poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
2549 : :
2550 : 406 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2551 : : {
2552 : 213 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2553 : :
2554 : 213 : if (dump_enabled_p ())
2555 : 203 : dump_printf_loc (MSG_NOTE, vect_location,
2556 : : "known peeling = %d.\n", npeel);
2557 : :
2558 : 213 : iters = build_int_cst (niters_type, npeel);
2559 : 213 : *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2560 : : }
2561 : : else
2562 : : {
2563 : 193 : tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
2564 : 193 : tree type = TREE_TYPE (misalign_in_elems);
2565 : 193 : HOST_WIDE_INT elem_size
2566 : 193 : = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2567 : : /* We only do prolog peeling if the target alignment is known at compile
2568 : : time. */
2569 : 193 : poly_uint64 align_in_elems =
2570 : 193 : exact_div (target_align, elem_size);
2571 : 193 : tree align_in_elems_minus_1 =
2572 : 193 : build_int_cst (type, align_in_elems - 1);
2573 : 193 : tree align_in_elems_tree = build_int_cst (type, align_in_elems);
2574 : :
2575 : : /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
2576 : : & (align_in_elems - 1)). */
2577 : 193 : bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2578 : 193 : size_zero_node) < 0;
2579 : 193 : if (negative)
2580 : 1 : iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
2581 : : align_in_elems_tree);
2582 : : else
2583 : 192 : iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
2584 : : misalign_in_elems);
2585 : 193 : iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
2586 : 193 : iters = fold_convert (niters_type, iters);
2587 : 193 : *bound = align_in_elems;
2588 : : }
2589 : :
2590 : 406 : if (dump_enabled_p ())
2591 : 226 : dump_printf_loc (MSG_NOTE, vect_location,
2592 : : "niters for prolog loop: %T\n", iters);
2593 : :
2594 : 406 : var = create_tmp_var (niters_type, "prolog_loop_niters");
2595 : 406 : iters_name = force_gimple_operand (iters, &new_stmts, false, var);
2596 : :
2597 : 406 : if (new_stmts)
2598 : 193 : gimple_seq_add_seq (&stmts, new_stmts);
2599 : 406 : if (stmts)
2600 : : {
2601 : 193 : gcc_assert (single_succ_p (bb));
2602 : 193 : gimple_stmt_iterator gsi = gsi_last_bb (bb);
2603 : 193 : if (gsi_end_p (gsi))
2604 : 16 : gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2605 : : else
2606 : 177 : gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
2607 : : }
2608 : 406 : return iters_name;
2609 : : }
2610 : :
2611 : :
2612 : : /* Function vect_update_init_of_dr
2613 : :
2614 : : If CODE is PLUS, the vector loop starts NITERS iterations after the
2615 : : scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
2616 : : iterations before the scalar one (using masking to skip inactive
2617 : : elements). This function updates the information recorded in DR to
2618 : : account for the difference. Specifically, it updates the OFFSET
2619 : : field of DR_INFO. */
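 : :
 : : /* E.g. (illustrative numbers): with NITERS = 5 and DR_STEP = 4 bytes,
 : : a PLUS update adds 5 * 4 = 20 bytes to DR_INFO's offset, so the
 : : vector loop's first access starts where the peeled scalar prolog
 : : left off. */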
2620 : :
2621 : : static void
2622 : 22508 : vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
2623 : : {
2624 : 22508 : struct data_reference *dr = dr_info->dr;
2625 : 22508 : tree offset = dr_info->offset;
2626 : 22508 : if (!offset)
2627 : 22508 : offset = build_zero_cst (sizetype);
2628 : :
2629 : 22508 : niters = fold_build2 (MULT_EXPR, sizetype,
2630 : : fold_convert (sizetype, niters),
2631 : : fold_convert (sizetype, DR_STEP (dr)));
2632 : 22508 : offset = fold_build2 (code, sizetype,
2633 : : fold_convert (sizetype, offset), niters);
2634 : 22508 : dr_info->offset = offset;
2635 : 22508 : }
2636 : :
2637 : :
2638 : : /* Function vect_update_inits_of_drs
2639 : :
2640 : : Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
2641 : : CODE and NITERS are as for vect_update_init_of_dr. */
2642 : :
2643 : : void
2644 : 7226 : vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
2645 : : tree_code code)
2646 : : {
2647 : 7226 : unsigned int i;
2648 : 7226 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2649 : 7226 : struct data_reference *dr;
2650 : :
2651 : 7226 : DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
2652 : : /* Adjust niters to sizetype. We used to insert the stmts on the loop preheader
2653 : : edge here, but since we might use these niters to update the epilogue's niters
2654 : : here, but since we might use these niters to update the epilogues niters
2655 : : and data references we can't insert them here as this definition might not
2656 : : always dominate its uses. */
2657 : 7226 : if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
2658 : 4467 : niters = fold_convert (sizetype, niters);
2659 : :
2660 : 37285 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2661 : : {
2662 : 22934 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2663 : 22934 : if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
2664 : 22508 : && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
2665 : 22508 : vect_update_init_of_dr (dr_info, niters, code);
2666 : : }
2667 : 7226 : }
2668 : :
2669 : : /* For the information recorded in LOOP_VINFO prepare the loop for peeling
2670 : : by masking. This involves calculating the number of iterations to
2671 : : be peeled and then aligning all memory references appropriately. */
2672 : :
2673 : : void
2674 : 1 : vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
2675 : : {
2676 : 1 : tree misalign_in_elems;
2677 : 1 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
2678 : :
2679 : 1 : gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
2680 : :
2681 : : /* From the information recorded in LOOP_VINFO get the number of iterations
2682 : : that need to be skipped via masking. */
2683 : 1 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2684 : : {
2685 : 1 : poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2686 : 1 : - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
2687 : 1 : misalign_in_elems = build_int_cst (type, misalign);
2688 : : }
2689 : : else
2690 : : {
2691 : 0 : gimple_seq seq1 = NULL, seq2 = NULL;
2692 : 0 : misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
2693 : 0 : misalign_in_elems = fold_convert (type, misalign_in_elems);
2694 : 0 : misalign_in_elems = force_gimple_operand (misalign_in_elems,
2695 : : &seq2, true, NULL_TREE);
2696 : 0 : gimple_seq_add_seq (&seq1, seq2);
2697 : 0 : if (seq1)
2698 : : {
2699 : 0 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2700 : 0 : basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
2701 : 0 : gcc_assert (!new_bb);
2702 : : }
2703 : : }
2704 : :
2705 : 1 : if (dump_enabled_p ())
2706 : 1 : dump_printf_loc (MSG_NOTE, vect_location,
2707 : : "misalignment for fully-masked loop: %T\n",
2708 : : misalign_in_elems);
2709 : :
2710 : 1 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
2711 : :
2712 : 1 : vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
2713 : 1 : }
2714 : :
2715 : : /* This function builds ni_name = number of iterations. Statements
2716 : : are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
2717 : : it to TRUE if a new ssa_var is generated. */
2718 : :
2719 : : tree
2720 : 60871 : vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
2721 : : {
2722 : 60871 : tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
2723 : 60871 : if (TREE_CODE (ni) == INTEGER_CST)
2724 : : return ni;
2725 : : else
2726 : : {
2727 : 25534 : tree ni_name, var;
2728 : 25534 : gimple_seq stmts = NULL;
2729 : 25534 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2730 : :
2731 : 25534 : var = create_tmp_var (TREE_TYPE (ni), "niters");
2732 : 25534 : ni_name = force_gimple_operand (ni, &stmts, false, var);
2733 : 25534 : if (stmts)
2734 : : {
2735 : 24486 : gsi_insert_seq_on_edge_immediate (pe, stmts);
2736 : 24486 : if (new_var_p != NULL)
2737 : 235 : *new_var_p = true;
2738 : : }
2739 : :
2740 : 25534 : return ni_name;
2741 : : }
2742 : : }
2743 : :
2744 : : /* Calculate the number of iterations above which the vectorized loop is
2745 : : preferred over the scalar loop. NITERS_PROLOG is the number of iterations
2746 : : of the prolog loop. If it's an integer constant, its value is also passed
2747 : : in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
2748 : : number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
2749 : : value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
2750 : : threshold below which the scalar (rather than vectorized) loop will be
2751 : : executed. This function stores the upper bound (inclusive) of the result
2752 : : in BOUND_SCALAR. */
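 : :
 : : /* A worked example (illustrative numbers): with NITERS_PROLOG = 3 and
 : : BOUND_EPILOG = 4 the scalar path covers at most 3 + 4 iterations;
 : : with CHECK_PROFITABILITY and TH = 16 the whole loop may instead run
 : : scalar whenever fewer than TH iterations remain, so the stored
 : : bound becomes max (3 + 4, 16 - 1) = 15. */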
2753 : :
2754 : : static tree
2755 : 24539 : vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
2756 : : poly_int64 bound_prolog, poly_int64 bound_epilog,
2757 : : int th, poly_uint64 *bound_scalar,
2758 : : bool check_profitability)
2759 : : {
2760 : 24539 : tree type = TREE_TYPE (niters_prolog);
2761 : 24539 : tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
2762 : : build_int_cst (type, bound_epilog));
2763 : :
2764 : 24539 : *bound_scalar = bound_prolog + bound_epilog;
2765 : 24539 : if (check_profitability)
2766 : : {
2767 : : /* TH indicates the minimum niters of vectorized loop, while we
2768 : : compute the maximum niters of scalar loop. */
2769 : 15699 : th--;
2770 : : /* Peeling for constant times. */
2771 : 15699 : if (int_niters_prolog >= 0)
2772 : : {
2773 : 15661 : *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
2774 : 15661 : return build_int_cst (type, *bound_scalar);
2775 : : }
2776 : : /* Peeling an unknown number of times. Note that both BOUND_PROLOG
2777 : : and BOUND_EPILOG are inclusive upper bounds. */
2778 : 38 : if (known_ge (th, bound_prolog + bound_epilog))
2779 : : {
2780 : 0 : *bound_scalar = th;
2781 : 0 : return build_int_cst (type, th);
2782 : : }
2783 : : /* Need to do runtime comparison. */
2784 : 38 : else if (maybe_gt (th, bound_epilog))
2785 : : {
2786 : 38 : *bound_scalar = upper_bound (*bound_scalar, th);
2787 : 38 : return fold_build2 (MAX_EXPR, type,
2788 : : build_int_cst (type, th), niters);
2789 : : }
2790 : : }
2791 : : return niters;
2792 : : }
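 : :
 : : /* A worked example with made-up numbers: NITERS_PROLOG = 3 (so
 : : INT_NITERS_PROLOG = 3), BOUND_EPILOG = 4, TH = 10 and
 : : CHECK_PROFITABILITY. TH is first decremented to 9 and the
 : : constant-peeling case yields
 : :
 : : *BOUND_SCALAR = upper_bound (3 + 4, 9) = 9
 : :
 : : so the scalar loop is known to run at most 9 iterations. */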
2793 : :
2794 : : /* NITERS is the number of times that the original scalar loop executes
2795 : : after peeling. Work out the maximum number of iterations N that can
2796 : : be handled by the vectorized form of the loop and then either:
2797 : :
2798 : : a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
2799 : :
2800 : : niters_vector = N
2801 : :
2802 : : b) set *STEP_VECTOR_PTR to one and generate:
2803 : :
2804 : : niters_vector = N / vf
2805 : :
2806 : : In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
2807 : : any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
2808 : : is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).
2809 : :
2810 : : Case (a) is used for LOOP_VINFO_USING_PARTIAL_VECTORS_P or if VF is
2811 : : variable. As stated above, NITERS_VECTOR then equals the number
2812 : : of scalar iterations and vect_set_loop_condition will handle the
2813 : : step. As opposed to (b) we don't know anything about NITERS_VECTOR's
2814 : : range here.
2815 : : */
2816 : :
2817 : : void
2818 : 33757 : vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
2819 : : tree *niters_vector_ptr, tree *step_vector_ptr,
2820 : : bool niters_no_overflow)
2821 : : {
2822 : 33757 : tree ni_minus_gap, var;
2823 : 33757 : tree niters_vector, step_vector, type = TREE_TYPE (niters);
2824 : 33757 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2825 : 33757 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
2826 : :
2827 : : /* If epilogue loop is required because of data accesses with gaps, we
2828 : : subtract one iteration from the total number of iterations here for
2829 : : correct calculation of RATIO. */
2830 : 33757 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2831 : : {
2832 : 353 : ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
2833 : : build_one_cst (type));
2834 : 353 : if (!is_gimple_val (ni_minus_gap))
2835 : : {
2836 : 169 : var = create_tmp_var (type, "ni_gap");
2837 : 169 : gimple *stmts = NULL;
2838 : 169 : ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
2839 : : true, var);
2840 : 169 : gsi_insert_seq_on_edge_immediate (pe, stmts);
2841 : : }
2842 : : }
2843 : : else
2844 : : ni_minus_gap = niters;
2845 : :
2846 : : /* To silence some unexpected warnings, simply initialize to 0. */
2847 : 33757 : unsigned HOST_WIDE_INT const_vf = 0;
2848 : 33757 : if (vf.is_constant (&const_vf)
2849 : 33757 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2850 : : {
2851 : : /* Create: niters / vf as a truncating division, which is equivalent
2852 : : to niters >> log2(vf) when vf is a power of two. */
2854 : : /* If it's known that niters == number of latch executions + 1 doesn't
2855 : : overflow, we can generate niters / vf; otherwise we generate
2856 : : (niters - vf) / vf + 1 by using the fact that we know ratio
2857 : : will be at least one. */
2858 : 33739 : tree var_vf = build_int_cst (type, const_vf);
2859 : 33739 : if (niters_no_overflow)
2860 : 33518 : niters_vector = fold_build2 (TRUNC_DIV_EXPR, type, ni_minus_gap,
2861 : : var_vf);
2862 : : else
2863 : 221 : niters_vector
2864 : 221 : = fold_build2 (PLUS_EXPR, type,
2865 : : fold_build2 (TRUNC_DIV_EXPR, type,
2866 : : fold_build2 (MINUS_EXPR, type,
2867 : : ni_minus_gap,
2868 : : var_vf),
2869 : : var_vf),
2870 : : build_int_cst (type, 1));
2871 : 33739 : step_vector = build_one_cst (type);
2872 : : }
2873 : : else
2874 : : {
2875 : 18 : niters_vector = ni_minus_gap;
2876 : 18 : step_vector = build_int_cst (type, vf);
2877 : : }
2878 : :
2879 : 33757 : if (!is_gimple_val (niters_vector))
2880 : : {
2881 : 25292 : var = create_tmp_var (type, "bnd");
2882 : 25292 : gimple_seq stmts = NULL;
2883 : 25292 : niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2884 : 25292 : gsi_insert_seq_on_edge_immediate (pe, stmts);
2885 : : /* The peeling algorithm guarantees that the vector loop bound is at
2886 : : least ONE, so we set range information to make the niters
2887 : : analyzer's life easier. Note the number of latch iterations can be
2888 : : TYPE_MAX_VALUE, so the vector niter has to represent
2889 : : (TYPE_MAX_VALUE + 1) / vf. */
2889 : 25292 : if (stmts != NULL
2890 : 25292 : && integer_onep (step_vector))
2891 : : {
2892 : 25277 : if (niters_no_overflow)
2893 : : {
2894 : 25074 : int_range<1> vr (type,
2895 : 50148 : wi::one (TYPE_PRECISION (type)),
2896 : 50148 : wi::div_trunc (wi::max_value
2897 : 25074 : (TYPE_PRECISION (type),
2898 : 25074 : TYPE_SIGN (type)),
2899 : : const_vf,
2900 : 50148 : TYPE_SIGN (type)));
2901 : 25074 : set_range_info (niters_vector, vr);
2902 : 25074 : }
2903 : : /* For VF == 1 the vector IV might also overflow so we cannot
2904 : : assert a minimum value of 1. */
2905 : 203 : else if (const_vf > 1)
2906 : : {
2907 : 167 : int_range<1> vr (type,
2908 : 334 : wi::one (TYPE_PRECISION (type)),
2909 : 501 : wi::rshift (wi::max_value (TYPE_PRECISION (type),
2910 : 167 : TYPE_SIGN (type))
2911 : 334 : - (const_vf - 1),
2912 : 334 : exact_log2 (const_vf), TYPE_SIGN (type))
2913 : 668 : + 1);
2914 : 167 : set_range_info (niters_vector, vr);
2915 : 167 : }
2916 : : }
2917 : : }
2918 : 33757 : *niters_vector_ptr = niters_vector;
2919 : 33757 : *step_vector_ptr = step_vector;
2920 : :
2921 : 33757 : return;
2922 : : }
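 : :
 : : /* For illustration (SSA names are made up), with a constant VF of 4
 : : and partial vectors not in use, case (b) emits on the preheader
 : : edge either
 : :
 : : bnd_8 = niters_6 / 4;
 : :
 : : when NITERS_NO_OVERFLOW, or the overflow-safe form
 : :
 : : bnd_8 = (niters_6 - 4) / 4 + 1;
 : :
 : : with *STEP_VECTOR_PTR = 1. Otherwise case (a) passes NITERS
 : : through unchanged and sets *STEP_VECTOR_PTR to VF. */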
2923 : :
2924 : : /* Given NITERS_VECTOR which is the number of iterations for vectorized
2925 : : loop specified by LOOP_VINFO after vectorization, compute the number
2926 : : of iterations before vectorization (niters_vector * vf) and store it
2927 : : to NITERS_VECTOR_MULT_VF_PTR. */
2928 : :
2929 : : static void
2930 : 33009 : vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2931 : : tree niters_vector,
2932 : : tree *niters_vector_mult_vf_ptr)
2933 : : {
2934 : : /* We should be using a step_vector of VF if VF is variable. */
2935 : 33009 : int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2936 : 33009 : tree type = TREE_TYPE (niters_vector);
2937 : 33009 : tree tree_vf = build_int_cst (type, vf);
2938 : 33009 : basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
2939 : :
2940 : 33009 : gcc_assert (niters_vector_mult_vf_ptr != NULL);
2941 : 33009 : tree niters_vector_mult_vf = fold_build2 (MULT_EXPR, type,
2942 : : niters_vector, tree_vf);
2943 : :
2944 : : /* If we've peeled a vector iteration then subtract one full vector
2945 : : iteration. */
2946 : 33009 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2947 : 21 : niters_vector_mult_vf = fold_build2 (MINUS_EXPR, type,
2948 : : niters_vector_mult_vf, tree_vf);
2949 : :
2950 : 33009 : if (!is_gimple_val (niters_vector_mult_vf))
2951 : : {
2952 : 24566 : tree var = create_tmp_var (type, "niters_vector_mult_vf");
2953 : 24566 : gimple_seq stmts = NULL;
2954 : 24566 : niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2955 : : &stmts, true, var);
2956 : 24566 : gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2957 : 24566 : gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2958 : : }
2959 : 33009 : *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2960 : 33009 : }
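 : :
 : : /* For illustration, with VF = 4 the exit block receives roughly
 : :
 : : niters_vector_mult_vf_9 = bnd_8 * 4;
 : :
 : : and when an early-break vector iteration was peeled a further
 : : "- 4" is folded into the expression. */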
2961 : :
2962 : : /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2963 : : of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2964 : : are two pred edges of the merge point before UPDATE_LOOP. The two loops
2965 : : appear like below:
2966 : :
2967 : : guard_bb:
2968 : : if (cond)
2969 : : goto merge_bb;
2970 : : else
2971 : : goto skip_loop;
2972 : :
2973 : : skip_loop:
2974 : : header_a:
2975 : : i_1 = PHI<i_0, i_2>;
2976 : : ...
2977 : : i_2 = i_1 + 1;
2978 : : if (cond_a)
2979 : : goto latch_a;
2980 : : else
2981 : : goto exit_a;
2982 : : latch_a:
2983 : : goto header_a;
2984 : :
2985 : : exit_a:
2986 : : i_5 = PHI<i_2>;
2987 : :
2988 : : merge_bb:
2989 : : ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2990 : :
2991 : : update_loop:
2992 : : header_b:
2993 : : i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2994 : : ...
2995 : : i_4 = i_3 + 1;
2996 : : if (cond_b)
2997 : : goto latch_b;
2998 : : else
2999 : : goto exit_bb;
3000 : : latch_b:
3001 : : goto header_b;
3002 : :
3003 : : exit_bb:
3004 : :
3005 : : This function creates PHI nodes at merge_bb and replaces the use of i_5
3006 : : in the update_loop's PHI node with the new PHI's result. */
3007 : :
3008 : : static void
3009 : 24945 : slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
3010 : : class loop *update_loop,
3011 : : edge guard_edge, edge merge_edge)
3012 : : {
3013 : 24945 : location_t merge_loc, guard_loc;
3014 : 24945 : edge orig_e = loop_preheader_edge (skip_loop);
3015 : 24945 : edge update_e = loop_preheader_edge (update_loop);
3016 : 24945 : gphi_iterator gsi_orig, gsi_update;
3017 : :
3018 : 24945 : for ((gsi_orig = gsi_start_phis (skip_loop->header),
3019 : 24945 : gsi_update = gsi_start_phis (update_loop->header));
3020 : 91338 : !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
3021 : 66393 : gsi_next (&gsi_orig), gsi_next (&gsi_update))
3022 : : {
3023 : 66393 : gphi *orig_phi = gsi_orig.phi ();
3024 : 66393 : gphi *update_phi = gsi_update.phi ();
3025 : :
3026 : : /* Generate new phi node at merge bb of the guard. */
3027 : 66393 : tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3028 : 66393 : gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
3029 : :
3030 : : /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
3031 : : args in NEW_PHI for these edges. */
3032 : 66393 : tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
3033 : 66393 : tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
3034 : 66393 : merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
3035 : 66393 : guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
3036 : 66393 : add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
3037 : 66393 : add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
3038 : :
3039 : : /* Update phi in UPDATE_PHI. */
3040 : 66393 : adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
3041 : : }
3042 : 24945 : }
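 : :
 : : /* In the diagram above, for the IV i this creates in merge_bb
 : :
 : : i_x = PHI <i_5 (merge_edge), i_0 (guard_edge)>
 : :
 : : and rewrites the i_5 argument of header_b's PHI to i_x. */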
3043 : :
3044 : : /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
3045 : : Return a value that equals:
3046 : :
3047 : : - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
3048 : : - SKIP_VALUE when the main loop is skipped. */
3049 : :
3050 : : tree
3051 : 3956 : vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
3052 : : tree skip_value)
3053 : : {
3054 : 3956 : gcc_assert (loop_vinfo->main_loop_edge);
3055 : :
3056 : 3956 : tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
3057 : 3956 : basic_block bb = loop_vinfo->main_loop_edge->dest;
3058 : 3956 : gphi *new_phi = create_phi_node (phi_result, bb);
3059 : 3956 : add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
3060 : : UNKNOWN_LOCATION);
3061 : 3956 : add_phi_arg (new_phi, skip_value,
3062 : : loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
3063 : 3956 : return phi_result;
3064 : : }
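 : :
 : : /* For illustration (names are made up): a caller passing
 : : MAIN_LOOP_VALUE = sum_10 and SKIP_VALUE = 0 gets back
 : :
 : : res_11 = PHI <sum_10 (main_loop_edge), 0 (skip_main_loop_edge)>
 : :
 : : created in the block where the two paths meet. */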
3065 : :
3066 : : /* Function vect_do_peeling.
3067 : :
3068 : : Input:
3069 : : - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
3070 : :
3071 : : preheader:
3072 : : LOOP:
3073 : : header_bb:
3074 : : loop_body
3075 : : if (exit_loop_cond) goto exit_bb
3076 : : else goto header_bb
3077 : : exit_bb:
3078 : :
3079 : : - NITERS: The number of iterations of the loop.
3080 : : - NITERSM1: The number of iterations of the loop's latch.
3081 : : - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
3082 : : - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
3083 : : CHECK_PROFITABILITY is true.
3084 : : Output:
3085 : : - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
3086 : : iterate after vectorization; see vect_set_loop_condition for details.
3087 : : - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
3088 : : should be set to the number of scalar iterations handled by the
3089 : : vector loop. The SSA name is only used on exit from the loop.
3090 : :
3091 : : This function peels prolog and epilog from the loop, adds guards skipping
3092 : : PROLOG and EPILOG for various conditions. As a result, the changed CFG
3093 : : would look like:
3094 : :
3095 : : guard_bb_1:
3096 : : if (prefer_scalar_loop) goto merge_bb_1
3097 : : else goto guard_bb_2
3098 : :
3099 : : guard_bb_2:
3100 : : if (skip_prolog) goto merge_bb_2
3101 : : else goto prolog_preheader
3102 : :
3103 : : prolog_preheader:
3104 : : PROLOG:
3105 : : prolog_header_bb:
3106 : : prolog_body
3107 : : if (exit_prolog_cond) goto prolog_exit_bb
3108 : : else goto prolog_header_bb
3109 : : prolog_exit_bb:
3110 : :
3111 : : merge_bb_2:
3112 : :
3113 : : vector_preheader:
3114 : : VECTOR LOOP:
3115 : : vector_header_bb:
3116 : : vector_body
3117 : : if (exit_vector_cond) goto vector_exit_bb
3118 : : else goto vector_header_bb
3119 : : vector_exit_bb:
3120 : :
3121 : : guard_bb_3:
3122 : : if (skip_epilog) goto merge_bb_3
3123 : : else goto epilog_preheader
3124 : :
3125 : : merge_bb_1:
3126 : :
3127 : : epilog_preheader:
3128 : : EPILOG:
3129 : : epilog_header_bb:
3130 : : epilog_body
3131 : : if (exit_epilog_cond) goto merge_bb_3
3132 : : else goto epilog_header_bb
3133 : :
3134 : : merge_bb_3:
3135 : :
3136 : : Note this function peels the prolog and epilog, and adds the
3137 : : corresponding guards, only when necessary.
3138 : : This function returns the epilogue loop if a decision was made to vectorize
3139 : : it, otherwise NULL.
3140 : :
3141 : : The analysis resulting in this epilogue loop's loop_vec_info was performed
3142 : : in the same vect_analyze_loop call as the main loop's. At that time
3143 : : vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
3144 : : vectorization factors than the main loop. This list is chained in the
3145 : : loop's loop_vec_info in the 'epilogue_vinfo' member. When we decide to
3146 : : vectorize the epilogue loop for a lower vectorization factor, the
3147 : : loop_vec_info in epilogue_vinfo is updated and linked to the epilogue loop.
3148 : : This is later used to vectorize the epilogue.
3149 : : The reason the loop_vec_info needs updating is that it was
3150 : : constructed based on the original main loop, and the epilogue loop is a
3151 : : copy of this loop, so all links pointing to statements in the original loop
3152 : : need updating. Furthermore, these loop_vec_infos share the
3153 : : data_reference's records, which will also need to be updated.
3154 : :
3155 : : TODO: Guard for prefer_scalar_loop should be emitted along with
3156 : : versioning conditions if loop versioning is needed. */
3157 : :
3158 : :
3159 : : class loop *
3160 : 60465 : vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
3161 : : tree *niters_vector, tree *step_vector,
3162 : : tree *niters_vector_mult_vf_var, int th,
3163 : : bool check_profitability, bool niters_no_overflow,
3164 : : tree *advance)
3165 : : {
3166 : 60465 : edge e, guard_e;
3167 : 60465 : tree type = TREE_TYPE (niters), guard_cond;
3168 : 60465 : basic_block guard_bb, guard_to;
3169 : 60465 : profile_probability prob_prolog, prob_vector, prob_epilog;
3170 : 60465 : int estimated_vf;
3171 : 60465 : int prolog_peeling = 0;
3172 : 60465 : bool vect_epilogues = loop_vinfo->epilogue_vinfo != NULL;
3173 : :
3174 : 60465 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3175 : 60464 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3176 : :
3177 : 60465 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3178 : 60465 : poly_uint64 bound_epilog = 0;
3179 : 60465 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
3180 : 60447 : && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3181 : 31983 : bound_epilog += vf - 1;
3182 : 60465 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3183 : 353 : bound_epilog += 1;
3184 : :
3185 : : /* For early breaks the scalar loop needs to execute at most VF times
3186 : : to find the element that caused the break. */
3187 : 60465 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3188 : 1482 : bound_epilog = vf;
3189 : :
3190 : 60465 : bool epilog_peeling = maybe_ne (bound_epilog, 0U);
3191 : 60465 : poly_uint64 bound_scalar = bound_epilog;
3192 : :
3193 : 60465 : if (!prolog_peeling && !epilog_peeling)
3194 : : return NULL;
3195 : :
3196 : : /* Before doing any peeling make sure to reset debug binds outside of
3197 : : the loop referring to defs not in LC SSA. */
3198 : 33040 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3199 : 101105 : for (unsigned i = 0; i < loop->num_nodes; ++i)
3200 : : {
3201 : 68065 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3202 : 68065 : imm_use_iterator ui;
3203 : 68065 : gimple *use_stmt;
3204 : 160957 : for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
3205 : 92892 : gsi_next (&gsi))
3206 : : {
3207 : 386923 : FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
3208 : 221082 : if (gimple_debug_bind_p (use_stmt)
3209 : 19943 : && loop != gimple_bb (use_stmt)->loop_father
3210 : 19959 : && !flow_loop_nested_p (loop,
3211 : 16 : gimple_bb (use_stmt)->loop_father))
3212 : : {
3213 : 2 : gimple_debug_bind_reset_value (use_stmt);
3214 : 2 : update_stmt (use_stmt);
3215 : 92892 : }
3216 : : }
3217 : 631643 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3218 : 495513 : gsi_next (&gsi))
3219 : : {
3220 : 495513 : ssa_op_iter op_iter;
3221 : 495513 : def_operand_p def_p;
3222 : 804199 : FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
3223 : 1080261 : FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
3224 : 499674 : if (gimple_debug_bind_p (use_stmt)
3225 : 36785 : && loop != gimple_bb (use_stmt)->loop_father
3226 : 36812 : && !flow_loop_nested_p (loop,
3227 : 27 : gimple_bb (use_stmt)->loop_father))
3228 : : {
3229 : 0 : gimple_debug_bind_reset_value (use_stmt);
3230 : 0 : update_stmt (use_stmt);
3231 : 308686 : }
3232 : : }
3233 : : }
3234 : :
3235 : 33040 : prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
3236 : 33040 : estimated_vf = vect_vf_for_cost (loop_vinfo);
3237 : 33040 : if (estimated_vf == 2)
3238 : 6399 : estimated_vf = 3;
3239 : 33040 : prob_prolog = prob_epilog = profile_probability::guessed_always ()
3240 : 33040 : .apply_scale (estimated_vf - 1, estimated_vf);
3241 : :
3242 : 33040 : class loop *prolog = NULL, *epilog = NULL;
3243 : 33040 : class loop *first_loop = loop;
3244 : 33040 : bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
3245 : :
3246 : : /* SSA form needs to be up-to-date since we are going to manually
3247 : : update SSA form in slpeel_tree_duplicate_loop_to_edge_cfg and delete
3248 : : all SSA update state after that, so we have to make sure not to lose
3249 : : any pending update needs. */
3250 : 33040 : gcc_assert (!need_ssa_update_p (cfun));
3251 : :
3252 : : /* If we're vectorizing an epilogue loop, we have ensured that the
3253 : : virtual operand is in SSA form throughout the vectorized main loop.
3254 : : Normally it is possible to trace the updated
3255 : : vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
3256 : : back to scalar-stmt vuses, meaning that the effect of the SSA update
3257 : : remains local to the main loop. However, there are rare cases in
3258 : : which the vectorized loop should have vdefs even when the original scalar
3259 : : loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
3260 : : introduces clobbers of the temporary vector array, which in turn
3261 : : needs new vdefs. If the scalar loop doesn't write to memory, these
3262 : : new vdefs will be the only ones in the vector loop.
3263 : : We currently defer updating virtual SSA form and the creation
3264 : : of a virtual PHI for this case so we do not have to make sure the
3265 : : newly introduced virtual def is in LCSSA form. */
3266 : :
3267 : 33040 : if (MAY_HAVE_DEBUG_BIND_STMTS)
3268 : : {
3269 : 12329 : gcc_assert (!adjust_vec.exists ());
3270 : 12329 : adjust_vec.create (32);
3271 : : }
3272 : 33040 : initialize_original_copy_tables ();
3273 : :
3274 : : /* Record the anchor bb at which the guard should be placed if the scalar
3275 : : loop might be preferred. */
3276 : 33040 : basic_block anchor = loop_preheader_edge (loop)->src;
3277 : :
3278 : : /* Generate the number of iterations for the prolog loop. We do this here
3279 : : so that we can also get the upper bound on the number of iterations. */
3280 : 33040 : tree niters_prolog;
3281 : 33040 : poly_int64 bound_prolog = 0;
3282 : 33040 : if (prolog_peeling)
3283 : : {
3284 : 406 : niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
3285 : : &bound_prolog);
3286 : : /* If alignment peeling is known, we will always execute the prolog. */
3287 : 406 : if (TREE_CODE (niters_prolog) == INTEGER_CST)
3288 : 213 : prob_prolog = profile_probability::always ();
3289 : : }
3290 : : else
3291 : 32634 : niters_prolog = build_int_cst (type, 0);
3292 : :
3293 : 33040 : loop_vec_info epilogue_vinfo = loop_vinfo->epilogue_vinfo;
3294 : 33040 : tree niters_vector_mult_vf = NULL_TREE;
3295 : : /* Saving NITERs before the loop, as this may be changed by prologue. */
3296 : 33040 : tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
3297 : 33040 : edge update_e = NULL, skip_e = NULL;
3298 : 33040 : unsigned int lowest_vf = constant_lower_bound (vf);
3299 : : /* Prolog loop may be skipped. */
3300 : 33040 : bool skip_prolog = (prolog_peeling != 0);
3301 : : /* Skip to the epilog when there are not enough iterations to enter the
3302 : : vectorized loop. If true we should perform a runtime check on
3303 : : NITERS to decide whether to skip the current vectorized loop. If
3304 : : we know the number of scalar iterations, we add such a check when
3305 : : that number may be smaller than the number of iterations required
3306 : : to enter this loop; for this we use the upper bounds on the prolog
3307 : : and epilog peeling. When we don't know the number of iterations
3308 : : and don't require versioning, we have already asserted that there
3309 : : are enough scalar iterations to enter the main loop, so this skip
3310 : : is not necessary. When we are versioning we only add such a skip
3311 : : if we have chosen to vectorize the epilogue. */
3313 : 8484 : bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3314 : 33040 : ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
3315 : 8484 : bound_prolog + bound_epilog)
3316 : 24593 : : (!LOOP_VINFO_USE_VERSIONING_WITHOUT_PEELING (loop_vinfo)
3317 : 8538 : || vect_epilogues));
3318 : :
3319 : : /* Epilog loop must be executed if the number of iterations for epilog
3320 : : loop is known at compile time, otherwise we need to add a check at
3321 : : the end of vector loop and skip to the end of epilog loop. */
3322 : 33040 : bool skip_epilog = (prolog_peeling < 0
3323 : 32847 : || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3324 : 33040 : || !vf.is_constant ());
3325 : : /* PEELING_FOR_GAPS and peeling for early breaks are special because epilog
3326 : : loop must be executed. */
3327 : 33040 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3328 : 32687 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3329 : 1835 : skip_epilog = false;
3330 : :
3331 : 33040 : class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3332 : 33040 : auto_vec<profile_count> original_counts;
3333 : 33040 : basic_block *original_bbs = NULL;
3334 : :
3335 : 33040 : if (skip_vector)
3336 : : {
3337 : 24539 : split_edge (loop_preheader_edge (loop));
3338 : :
3339 : 24539 : if (epilog_peeling && (vect_epilogues || scalar_loop == NULL))
3340 : : {
3341 : 22328 : original_bbs = get_loop_body (loop);
3342 : 67426 : for (unsigned int i = 0; i < loop->num_nodes; i++)
3343 : 45098 : original_counts.safe_push(original_bbs[i]->count);
3344 : : }
3345 : :
3346 : : /* Due to the order in which we peel prolog and epilog, we first
3347 : : propagate probability to the whole loop. The purpose is to
3348 : : avoid adjusting probabilities of both prolog and vector loops
3349 : : separately. Note in this case, the probability of epilog loop
3350 : : needs to be scaled back later. */
3351 : 24539 : basic_block bb_before_loop = loop_preheader_edge (loop)->src;
3352 : 24539 : if (prob_vector.initialized_p ())
3353 : : {
3354 : 24539 : scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
3355 : 24539 : scale_loop_profile (loop, prob_vector, -1);
3356 : : }
3357 : : }
3358 : :
3359 : 33040 : if (vect_epilogues)
3360 : : {
3361 : : /* Make sure to set the epilogue's epilogue scalar loop, such that we can
3362 : : use the original scalar loop as remaining epilogue if necessary. */
3363 : 6819 : LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
3364 : 6819 : = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3365 : 6819 : LOOP_VINFO_SCALAR_IV_EXIT (epilogue_vinfo)
3366 : 6819 : = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3367 : : }
3368 : :
3369 : 33040 : if (prolog_peeling)
3370 : : {
3371 : 406 : e = loop_preheader_edge (loop);
3372 : 406 : edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3373 : 406 : gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e)
3374 : : && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo));
3375 : :
3376 : : /* Peel prolog and put it on preheader edge of loop. */
3377 : 406 : edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3378 : 406 : edge prolog_e = NULL;
3379 : 406 : prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, exit_e,
3380 : : scalar_loop, scalar_e,
3381 : : e, &prolog_e);
3382 : 406 : gcc_assert (prolog);
3383 : 406 : prolog->force_vectorize = false;
3384 : :
3385 : 406 : first_loop = prolog;
3386 : 406 : reset_original_copy_tables ();
3387 : :
3388 : : /* Update the number of iterations for prolog loop. */
3389 : 406 : tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
3390 : 406 : vect_set_loop_condition (prolog, prolog_e, NULL, niters_prolog,
3391 : : step_prolog, NULL_TREE, false);
3392 : :
3393 : : /* Skip the prolog loop. */
3394 : 406 : if (skip_prolog)
3395 : : {
3396 : 406 : guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3397 : : niters_prolog, build_int_cst (type, 0));
3398 : 406 : guard_bb = loop_preheader_edge (prolog)->src;
3399 : 406 : basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
3400 : 406 : guard_to = split_edge (loop_preheader_edge (loop));
3401 : 406 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3402 : : guard_to, guard_bb,
3403 : : prob_prolog.invert (),
3404 : : irred_flag);
3405 : 1857 : for (edge alt_e : get_loop_exit_edges (prolog))
3406 : : {
3407 : 639 : if (alt_e == prolog_e)
3408 : 406 : continue;
3409 : 233 : basic_block old_dom
3410 : 233 : = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
3411 : 233 : if (flow_bb_inside_loop_p (prolog, old_dom))
3412 : : {
3413 : 68 : auto_vec<basic_block, 8> queue;
3414 : 68 : for (auto son = first_dom_son (CDI_DOMINATORS, old_dom);
3415 : 219 : son; son = next_dom_son (CDI_DOMINATORS, son))
3416 : 151 : if (!flow_bb_inside_loop_p (prolog, son))
3417 : 83 : queue.safe_push (son);
3418 : 287 : for (auto son : queue)
3419 : 83 : set_immediate_dominator (CDI_DOMINATORS, son, guard_bb);
3420 : 68 : }
3421 : 406 : }
3422 : :
3423 : 406 : e = EDGE_PRED (guard_to, 0);
3424 : 406 : e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3425 : 406 : slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
3426 : :
3427 : 406 : scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
3428 : 406 : scale_loop_profile (prolog, prob_prolog,
3429 : 406 : estimated_poly_value (bound_prolog) - 1);
3430 : : }
3431 : :
3432 : : /* Update init address of DRs. */
3433 : 406 : vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
3434 : : /* Update niters for vector loop. */
3435 : 406 : LOOP_VINFO_NITERS (loop_vinfo)
3436 : 406 : = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
3437 : 406 : LOOP_VINFO_NITERSM1 (loop_vinfo)
3438 : 406 : = fold_build2 (MINUS_EXPR, type,
3439 : : LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3440 : 406 : bool new_var_p = false;
3441 : 406 : niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
3442 : : /* It's guaranteed that vector loop bound before vectorization is at
3443 : : least VF, so set range information for newly generated var. */
3444 : 406 : if (new_var_p)
3445 : : {
3446 : 235 : int_range<1> vr (type,
3447 : 470 : wi::to_wide (build_int_cst (type, lowest_vf)),
3448 : 470 : wi::to_wide (TYPE_MAX_VALUE (type)));
3449 : 235 : set_range_info (niters, vr);
3450 : 235 : }
3451 : :
3452 : : /* Prolog iterates at most bound_prolog times, latch iterates at
3453 : : most bound_prolog - 1 times. */
3454 : 406 : if (bound_prolog.is_constant ())
3455 : 406 : record_niter_bound (prolog, bound_prolog.to_constant () - 1, false,
3456 : : true);
3457 : 406 : delete_update_ssa ();
3458 : 406 : adjust_vec_debug_stmts ();
3459 : 406 : scev_reset ();
3460 : : }
3461 : 33040 : basic_block bb_before_epilog = NULL;
3462 : :
3463 : 33040 : if (epilog_peeling)
3464 : : {
3465 : 33009 : e = LOOP_VINFO_IV_EXIT (loop_vinfo);
3466 : 33009 : gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, e, e));
3467 : :
3468 : : /* Peel epilog and put it on exit edge of loop. If we are vectorizing
3469 : : said epilog then we should use a copy of the main loop as a starting
3470 : : point. This loop may have already had some preliminary transformations
3471 : : to allow for more optimal vectorization, for example if-conversion.
3472 : : If we are not vectorizing the epilog then we should use the scalar loop
3473 : : as the transformations mentioned above make less or no sense when not
3474 : : vectorizing. */
3475 : 33009 : edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
3476 : 33009 : epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
3477 : 6819 : edge epilog_e = vect_epilogues ? e : scalar_e;
3478 : 33009 : edge new_epilog_e = NULL;
3479 : 33009 : auto_vec<basic_block> doms;
3480 : 33009 : epilog
3481 : 33009 : = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e, epilog, epilog_e, e,
3482 : : &new_epilog_e, true, &doms);
3483 : :
3484 : 33009 : LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo) = new_epilog_e;
3485 : 33009 : gcc_assert (epilog);
3486 : 33009 : gcc_assert (new_epilog_e);
3487 : 33009 : epilog->force_vectorize = false;
3488 : 33009 : bb_before_epilog = loop_preheader_edge (epilog)->src;
3489 : :
3490 : : /* The scalar version of the loop may be preferred. In this case, add
3491 : : a guard and skip to the epilog. Note this only happens when the
3492 : : number of iterations of the loop is unknown at compile time,
3493 : : otherwise this won't be vectorized. */
3494 : 33009 : if (skip_vector)
3495 : : {
3496 : : /* Additional epilogue iteration is peeled if gap exists. */
3497 : 24539 : tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
3498 : 24539 : bound_prolog, bound_epilog,
3499 : : th, &bound_scalar,
3500 : : check_profitability);
3501 : : /* Build guard against NITERSM1 since NITERS may overflow. */
3502 : 24539 : guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
3503 : 24539 : guard_bb = anchor;
3504 : 24539 : guard_to = split_edge (loop_preheader_edge (epilog));
3505 : 24539 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
3506 : : guard_to, guard_bb,
3507 : : prob_vector.invert (),
3508 : : irred_flag);
3509 : 24539 : skip_e = guard_e;
3510 : 24539 : e = EDGE_PRED (guard_to, 0);
3511 : 24539 : e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
3512 : :
3513 : : /* Handle any remaining dominator updates needed after
3514 : : inserting the loop skip edge above. */
3515 : 24539 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
3516 : 299 : && prolog_peeling)
3517 : : {
3518 : : /* Adding a skip edge to skip a loop with multiple exits
3519 : : means the dominator of the join blocks for all exits shifts
3520 : : from the prolog skip guard to the loop skip guard. */
3521 : 176 : auto prolog_skip_bb
3522 : 176 : = single_pred (loop_preheader_edge (prolog)->src);
3523 : 176 : auto needs_update
3524 : 176 : = get_dominated_by (CDI_DOMINATORS, prolog_skip_bb);
3525 : :
3526 : : /* Update everything except for the immediate children of
3527 : : the prolog skip block (the prolog and vector preheaders).
3528 : : Those should remain dominated by the prolog skip block itself,
3529 : : since the loop guard edge goes to the epilogue. */
3530 : 926 : for (auto bb : needs_update)
3531 : 398 : if (bb != EDGE_SUCC (prolog_skip_bb, 0)->dest
3532 : 398 : && bb != EDGE_SUCC (prolog_skip_bb, 1)->dest)
3533 : 46 : set_immediate_dominator (CDI_DOMINATORS, bb, guard_bb);
3534 : 176 : }
3535 : :
3536 : 24539 : slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
3537 : :
3538 : : /* Simply propagate profile info from guard_bb to guard_to which is
3539 : : a merge point of control flow. */
3540 : 24539 : profile_count old_count = guard_to->count;
3541 : 24539 : guard_to->count = guard_bb->count;
3542 : :
3543 : : /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
3544 : 24539 : if (vect_epilogues || scalar_loop == NULL)
3545 : : {
3546 : 22328 : gcc_assert(epilog->num_nodes == loop->num_nodes);
3547 : 22328 : basic_block *bbs = get_loop_body (epilog);
3548 : 67426 : for (unsigned int i = 0; i < epilog->num_nodes; i++)
3549 : : {
3550 : 45098 : gcc_assert(get_bb_original (bbs[i]) == original_bbs[i]);
3551 : 45098 : bbs[i]->count = original_counts[i];
3552 : : }
3553 : 22328 : free (bbs);
3554 : 22328 : free (original_bbs);
3555 : : }
3556 : 2211 : else if (old_count.nonzero_p ())
3557 : 2211 : scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
3558 : :
3559 : : /* Only need to handle basic block before epilog loop if it's not
3560 : : the guard_bb, which is the case when skip_vector is true. */
3561 : 24539 : if (guard_bb != bb_before_epilog && single_pred_p (bb_before_epilog))
3562 : 24240 : bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
3563 : 24539 : bb_before_epilog = loop_preheader_edge (epilog)->src;
3564 : : }
3565 : :
3566 : : /* If the loop is peeled a non-zero constant number of times, niters now
3567 : : refers to orig_niters - prolog_peeling; it won't overflow even if
3568 : : orig_niters overflows. */
3569 : 33009 : niters_no_overflow |= (prolog_peeling > 0);
3570 : 33009 : vect_gen_vector_loop_niters (loop_vinfo, niters,
3571 : : niters_vector, step_vector,
3572 : : niters_no_overflow);
3573 : 33009 : if (!integer_onep (*step_vector))
3574 : : {
3575 : : /* On exit from the loop we will have an easy way of calculating
3576 : : NITERS_VECTOR / STEP * STEP. Install a dummy definition
3577 : : until then. */
3578 : 0 : niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
3579 : 0 : SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
3580 : 0 : *niters_vector_mult_vf_var = niters_vector_mult_vf;
3581 : : }
3582 : : else
3583 : 33009 : vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
3584 : : &niters_vector_mult_vf);
3585 : : /* Update IVs of original loop as if they were advanced by
3586 : : niters_vector_mult_vf steps. */
3587 : 33009 : gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
3588 : 33009 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3589 : 1482 : update_e = single_succ_edge (LOOP_VINFO_IV_EXIT (loop_vinfo)->dest);
3590 : :
3591 : : /* If we have a peeled vector iteration we will never skip the epilog loop
3592 : : and we can simplify the cfg a lot by not doing the edge split. */
3593 : 33009 : if (skip_epilog
3594 : 33009 : || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
3595 : 1482 : && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)))
3596 : : {
3597 : 25535 : guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
3598 : : niters, niters_vector_mult_vf);
3599 : :
3600 : 25535 : guard_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
3601 : 25535 : edge epilog_e = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3602 : 25535 : guard_to = epilog_e->dest;
3603 : 26710 : guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
3604 : : skip_vector ? anchor : guard_bb,
3605 : : prob_epilog.invert (),
3606 : : irred_flag);
3607 : 25535 : doms.safe_push (guard_to);
3608 : 25535 : if (vect_epilogues)
3609 : 5577 : epilogue_vinfo->skip_this_loop_edge = guard_e;
3610 : 25535 : edge main_iv = LOOP_VINFO_IV_EXIT (loop_vinfo);
3611 : 25535 : gphi_iterator gsi2 = gsi_start_phis (main_iv->dest);
3612 : 25535 : for (gphi_iterator gsi = gsi_start_phis (guard_to);
3613 : 61777 : !gsi_end_p (gsi); gsi_next (&gsi))
3614 : : {
3615 : : /* We expect all of the PHIs we have on epilog_e
3616 : : to also be on the main loop exit. But sometimes
3617 : : a stray virtual definition can appear at epilog_e
3618 : : which we can then take as the same on all exits;
3619 : : we removed the LC SSA PHI on the main exit earlier
3620 : : so we don't need to create a loop PHI for it. */
3621 : 36242 : if (virtual_operand_p (gimple_phi_result (*gsi))
3622 : 36242 : && (gsi_end_p (gsi2)
3623 : 35922 : || !virtual_operand_p (gimple_phi_result (*gsi2))))
3624 : 182 : add_phi_arg (*gsi,
3625 : 91 : gimple_phi_arg_def_from_edge (*gsi, epilog_e),
3626 : : guard_e, UNKNOWN_LOCATION);
3627 : : else
3628 : : {
3629 : 36151 : add_phi_arg (*gsi, gimple_phi_result (*gsi2), guard_e,
3630 : : UNKNOWN_LOCATION);
3631 : 36151 : gsi_next (&gsi2);
3632 : : }
3633 : : }
3634 : :
3635 : : /* Only need to handle basic block before epilog loop if it's not
3636 : : the guard_bb, which is the case when skip_vector is true. */
3637 : 25535 : if (guard_bb != bb_before_epilog)
3638 : : {
3639 : 25532 : prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
3640 : :
3641 : 25532 : scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
3642 : : }
3643 : 25535 : scale_loop_profile (epilog, prob_epilog, -1);
3644 : : }
3645 : :
3646 : : /* Identify the right forward edge for the non-early-break case; this
3647 : : must be done after splitting the epilog edge. */
3648 : 33009 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3649 : 31527 : update_e = skip_vector ? e : loop_preheader_edge (epilog);
3650 : :
3651 : : /* If we have a peeled vector iteration all exits are the same, and so
3652 : : the main exit needs to be treated like the alternative exits: we
3653 : : leave their updates to vectorizable_live_operations. */
3655 : 33009 : tree vector_iters_vf = niters_vector_mult_vf;
3656 : 33009 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3657 : : {
3658 : 1482 : tree scal_iv_ty = signed_type_for (TREE_TYPE (vector_iters_vf));
3659 : 1482 : tree tmp_niters_vf = make_ssa_name (scal_iv_ty);
3660 : 1482 : basic_block exit_bb = NULL;
3661 : 1482 : edge update_e = NULL;
3662 : :
3663 : : /* Identify the early exit merge block. I wish we had stored this. */
3664 : 4446 : for (auto e : get_loop_exit_edges (loop))
3665 : 1482 : if (e != LOOP_VINFO_IV_EXIT (loop_vinfo))
3666 : : {
3667 : 1482 : exit_bb = e->dest;
3668 : 1482 : update_e = single_succ_edge (exit_bb);
3669 : 1482 : break;
3670 : 1482 : }
3671 : 1482 : vect_update_ivs_after_vectorizer (loop_vinfo, tmp_niters_vf,
3672 : : update_e, true);
3673 : :
3674 : 1482 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
3675 : 21 : vector_iters_vf = tmp_niters_vf;
3676 : :
3677 : 1482 : LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo) = tmp_niters_vf;
3678 : : }
3679 : :
3680 : 33009 : bool recalculate_peel_niters_init
3681 : 33009 : = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
3682 : 33009 : vect_update_ivs_after_vectorizer (loop_vinfo, vector_iters_vf,
3683 : : update_e,
3684 : : recalculate_peel_niters_init);
3685 : :
3686 : : /* Recalculate the dominators after adding the guard edge. */
3687 : 33009 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3688 : 1482 : iterate_fix_dominators (CDI_DOMINATORS, doms, false);
3689 : :
3690 : : /* When we do not have a loop-around edge to the epilog we know
3691 : : the vector loop covered at least VF scalar iterations unless
3692 : : we have early breaks.
3693 : : Update any known upper bound with this knowledge. */
3694 : 33009 : if (! skip_vector
3695 : 8470 : && ! LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3696 : : {
3697 : 7287 : if (epilog->any_upper_bound)
3698 : 7287 : epilog->nb_iterations_upper_bound -= lowest_vf;
3699 : 7287 : if (epilog->any_likely_upper_bound)
3700 : 7287 : epilog->nb_iterations_likely_upper_bound -= lowest_vf;
3701 : 7287 : if (epilog->any_estimate)
3702 : 7285 : epilog->nb_iterations_estimate -= lowest_vf;
3703 : : }
3704 : :
3705 : 33009 : unsigned HOST_WIDE_INT bound;
3706 : 33009 : if (bound_scalar.is_constant (&bound))
3707 : : {
3708 : 33009 : gcc_assert (bound != 0);
3709 : : /* Adjust the upper bound by the extra peeled vector iteration if we
3710 : : are an epilogue of a peeled vect loop and not VLA. For VLA the
3711 : : loop bounds are unknown. */
3712 : 33009 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
3713 : 33009 : && vf.is_constant ())
3714 : 21 : bound += vf.to_constant ();
3715 : : /* -1 to convert loop iterations to latch iterations. */
3716 : 33009 : record_niter_bound (epilog, bound - 1, false, true);
3717 : 33009 : scale_loop_profile (epilog, profile_probability::always (),
3718 : : bound - 1);
3719 : : }
3720 : :
3721 : 33009 : delete_update_ssa ();
3722 : 33009 : adjust_vec_debug_stmts ();
3723 : 33009 : scev_reset ();
3724 : 33009 : }
3725 : :
3726 : 33040 : if (vect_epilogues)
3727 : : {
3728 : 6819 : epilog->aux = epilogue_vinfo;
3729 : 6819 : LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
3730 : 6819 : LOOP_VINFO_IV_EXIT (epilogue_vinfo)
3731 : 6819 : = LOOP_VINFO_EPILOGUE_IV_EXIT (loop_vinfo);
3732 : :
3733 : 6819 : loop_constraint_clear (epilog, LOOP_C_INFINITE);
3734 : :
3735 : : /* We must now calculate the number of iterations NITERS performed by
3736 : : the previous loop and the EPILOGUE_NITERS to be performed by the epilogue. */
3737 : 6819 : tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
3738 : : niters_prolog, niters_vector_mult_vf);
3739 : :
3740 : : /* If skip_vector we may skip the previous loop; we insert a phi-node to
3741 : : determine whether we are coming from the previous vectorized loop
3742 : : via the update_e edge or from the skip_vector basic block via the
3743 : : skip_e edge. */
3744 : 6819 : if (skip_vector)
3745 : : {
3746 : 5646 : gcc_assert (update_e != NULL && skip_e != NULL);
3747 : 5646 : gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
3748 : : update_e->dest);
3749 : 5646 : tree new_ssa = make_ssa_name (TREE_TYPE (niters));
3750 : 5646 : gimple *stmt = gimple_build_assign (new_ssa, niters);
3751 : 5646 : gimple_stmt_iterator gsi;
3752 : 5646 : if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
3753 : 5646 : && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
3754 : : {
3755 : 5646 : gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
3756 : 5646 : gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
3757 : : }
3758 : : else
3759 : : {
3760 : 0 : gsi = gsi_last_bb (update_e->src);
3761 : 0 : gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
3762 : : }
3763 : :
3764 : 5646 : niters = new_ssa;
3765 : 5646 : add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
3766 : 5646 : add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
3767 : : UNKNOWN_LOCATION);
3768 : 5646 : niters = PHI_RESULT (new_phi);
3769 : 5646 : epilogue_vinfo->main_loop_edge = update_e;
3770 : 5646 : epilogue_vinfo->skip_main_loop_edge = skip_e;
3771 : : }
3772 : :
3773 : : /* Set ADVANCE to the number of iterations performed by the previous
3774 : : loop and its prologue. */
3775 : 6819 : *advance = niters;
3776 : :
3777 : : /* Subtract the number of iterations performed by the vectorized loop
3778 : : from the number of total iterations. */
3779 : 6819 : tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
3780 : : before_loop_niters,
3781 : : niters);
3782 : :
3783 : 6819 : LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
3784 : 6819 : LOOP_VINFO_NITERSM1 (epilogue_vinfo)
3785 : 6819 : = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
3786 : : epilogue_niters,
3787 : : build_one_cst (TREE_TYPE (epilogue_niters)));
3788 : : }
3789 : :
3790 : 33040 : adjust_vec.release ();
3791 : 33040 : free_original_copy_tables ();
3792 : :
3793 : 33040 : return vect_epilogues ? epilog : NULL;
3794 : 33040 : }
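 : :
 : : /* An illustrative walk-through of a common case: with unknown NITERS,
 : : PROLOG_PEELING == 0 and PEELING_FOR_NITER, only the epilog is
 : : peeled. A guard "nitersm1 < niters_prolog + bound_epilog"
 : : (possibly maxed with the profitability threshold) skips the vector
 : : loop entirely, and a second guard "niters == niters_vector_mult_vf"
 : : skips the epilog when the vector loop consumed all iterations. */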
3795 : :
3796 : : /* Function vect_create_cond_for_niters_checks.
3797 : :
3798 : : Create a conditional expression that represents the run-time checks for
3799 : : loop's niter. The loop is guaranteed to terminate if the run-time
3800 : : checks hold.
3801 : :
3802 : : Input:
3803 : : COND_EXPR - input conditional expression. New conditions will be chained
3804 : : with logical AND operation.
3805 : : LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the
3806 : : assumptions on the loop's niters to be checked.
3808 : :
3809 : : Output:
3810 : : COND_EXPR - conditional expression.
3811 : :
3812 : : The returned COND_EXPR is the conditional expression to be used in the
3813 : : if statement that controls which version of the loop gets executed at
3814 : : runtime. */
3815 : :
3816 : : static void
3817 : 394 : vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3818 : : {
3819 : 394 : tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3820 : :
3821 : 394 : if (*cond_expr)
3822 : 394 : *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3823 : : *cond_expr, part_cond_expr);
3824 : : else
3825 : 0 : *cond_expr = part_cond_expr;
3826 : 394 : }
3827 : :
3828 : : /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3829 : : and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
3830 : :
3831 : : static void
3832 : 334 : chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3833 : : {
3834 : 334 : if (*cond_expr)
3835 : 334 : *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3836 : : *cond_expr, part_cond_expr);
3837 : : else
3838 : 0 : *cond_expr = part_cond_expr;
3839 : 334 : }
3840 : :
3841 : : /* Function vect_create_cond_for_align_checks.
3842 : :
3843 : : Create a conditional expression that represents the alignment checks for
3844 : : all of data references (array element references) whose alignment must be
3845 : : checked at runtime.
3846 : :
3847 : : Input:
3848 : : COND_EXPR - input conditional expression. New conditions will be chained
3849 : : with logical AND operation.
3850 : : LOOP_VINFO - three fields of the loop information are used.
3851 : : LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3852 : : LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3853 : : LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT indicates which check applies.
3854 : :
3855 : : Output:
3856 : : COND_EXPR_STMT_LIST - statements needed to construct the conditional
3857 : : expression.
3858 : : The returned value is the conditional expression to be used in the if
3859 : : statement that controls which version of the loop gets executed at runtime.
3860 : :
3861 : : Based on the boolean value of LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT, we decide
3862 : : which type of check should be applied and create two different expressions
3863 : : accordingly.
3864 : : 1) When LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is false, we see if all data refs
3865 : : to be checked are already aligned to an alignment boundary. We create
3866 : : an expression of "(a_1 | a_2 | a_3 | ... | a_n) & mask", where "a_i" is
3867 : : the address of i'th data reference.
3868 : : 2) When LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is true, we see if all data refs
3869 : : can be aligned to a boundary after a certain amount of peeling, in other
3870 : : words, their addresses have the same bottom bits according to the mask.
3871 : : We create "((a_1 ^ a_2) | (a_2 ^ a_3) | ... | (a_n-1 ^ a_n)) & mask",
3872 : : where "a_i" is the address of i'th data reference.
3873 : :
3874 : : Both algorithms make two assumptions:
3875 : : 1) The number of bytes "n" in a vector is a power of 2.
3876 : : 2) An address "a" is aligned if a%n is zero and this
3877 : : test can be done as a&(n-1) == 0. For example, for 16
3878 : : byte vectors the test is a&0xf == 0. */
3879 : :
3880 : : static void
3881 : 50 : vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3882 : : tree *cond_expr,
3883 : : gimple_seq *cond_expr_stmt_list)
3884 : : {
3885 : 50 : const vec<stmt_vec_info> &may_misalign_stmts
3886 : : = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3887 : 50 : stmt_vec_info stmt_info;
3888 : 50 : poly_uint64 mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3889 : 50 : tree mask_cst;
3890 : 50 : unsigned int i;
3891 : 50 : tree int_ptrsize_type;
3892 : 50 : char tmp_name[30];
3893 : 50 : tree or_tmp_name = NULL_TREE;
3894 : 50 : tree prev_addr_tmp_name = NULL_TREE;
3895 : 50 : tree and_tmp_name;
3896 : 50 : gimple *and_stmt;
3897 : 50 : tree ptrsize_zero;
3898 : 50 : tree part_cond_expr;
3899 : :
3900 : 50 : gcc_assert (known_ne (mask, 0U));
3901 : :
3902 : 50 : int_ptrsize_type = signed_type_for (ptr_type_node);
3903 : :
3904 : : /* If LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT is true, we should have at least two
3905 : : datarefs to check the mutual alignment. */
3906 : 50 : gcc_assert (may_misalign_stmts.length () > 1
3907 : : || !LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo));
3908 : :
3909 : 125 : FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3910 : : {
3911 : 75 : gimple_seq new_stmt_list = NULL;
3912 : 75 : tree addr_base;
3913 : 75 : tree addr_tmp_name;
3914 : 75 : tree xor_tmp_name;
3915 : 75 : tree new_or_tmp_name;
3916 : 75 : gimple *addr_stmt, *or_stmt, *xor_stmt;
3917 : 75 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3918 : 75 : bool negative = tree_int_cst_compare
3919 : 75 : (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3920 : 75 : tree offset = negative
3921 : 75 : ? size_int ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
3922 : : * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))))
3923 : 75 : : size_zero_node;
3924 : :
3925 : : /* create: addr_tmp = (int)(address_of_first_vector) */
3926 : 75 : addr_base =
3927 : 75 : vect_create_addr_base_for_vector_ref (loop_vinfo,
3928 : : stmt_info, &new_stmt_list,
3929 : : offset);
3930 : 75 : if (new_stmt_list != NULL)
3931 : 18 : gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3932 : :
3933 : 75 : sprintf (tmp_name, "addr2int%d", i);
3934 : 75 : addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3935 : 75 : addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3936 : 75 : gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3937 : :
3938 : 75 : if (LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo))
3939 : : {
3940 : : /* Create "((a_1 ^ a_2) | (a_2 ^ a_3) | ... | (a_n-1 ^ a_n)) & mask"
3941 : : to check mutual alignment. */
3942 : 48 : if (prev_addr_tmp_name != NULL_TREE)
3943 : : {
3944 : 24 : sprintf (tmp_name, "xorptrs%d_%d", i - 1, i);
3945 : 24 : xor_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
3946 : : tmp_name);
3947 : 24 : xor_stmt = gimple_build_assign (xor_tmp_name, BIT_XOR_EXPR,
3948 : : prev_addr_tmp_name,
3949 : : addr_tmp_name);
3950 : 24 : gimple_seq_add_stmt (cond_expr_stmt_list, xor_stmt);
3951 : 24 : if (or_tmp_name == NULL_TREE)
3952 : : {
3953 : : /* Create the 1st XOR when the 2nd data ref is seen. */
3954 : : or_tmp_name = xor_tmp_name;
3955 : : }
3956 : : else
3957 : : {
3958 : : /* Create: or_tmp = or_tmp | new_xor_tmp. */
3959 : 0 : sprintf (tmp_name, "orxors%d", i - 1);
3960 : 0 : new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
3961 : : tmp_name);
3962 : 0 : or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3963 : : or_tmp_name, xor_tmp_name);
3964 : 0 : gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3965 : 0 : or_tmp_name = new_or_tmp_name;
3966 : : }
3967 : : }
3968 : : prev_addr_tmp_name = addr_tmp_name;
3969 : : }
3970 : : else
3971 : : {
3972 : : /* Create: "(a_1 | a_2 | a_3 | ... | a_n) & mask" to check if all
3973 : : addresses are already aligned. */
3974 : 27 : if (or_tmp_name != NULL_TREE)
3975 : : {
3976 : : /* Create: or_tmp = or_tmp | addr_tmp. */
3977 : 1 : sprintf (tmp_name, "orptrs%d", i);
3978 : 1 : new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
3979 : : tmp_name);
3980 : 1 : or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3981 : : or_tmp_name, addr_tmp_name);
3982 : 1 : gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3983 : 1 : or_tmp_name = new_or_tmp_name;
3984 : : }
3985 : : else
3986 : : or_tmp_name = addr_tmp_name;
3987 : : }
3988 : :
3989 : : } /* end for i */
3990 : :
3991 : 50 : mask_cst = build_int_cst (int_ptrsize_type, mask);
3992 : :
3993 : : /* create: and_tmp = or_tmp & mask */
3994 : 50 : and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3995 : :
3996 : 50 : and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3997 : : or_tmp_name, mask_cst);
3998 : 50 : gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3999 : :
4000 : : /* Make and_tmp the left operand of the conditional test against zero.
4001 : : If and_tmp has a nonzero bit then some address is unaligned. */
4002 : 50 : ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
4003 : 50 : part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
4004 : : and_tmp_name, ptrsize_zero);
4005 : 50 : chain_cond_expr (cond_expr, part_cond_expr);
4006 : 50 : }
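 : :
 : : /* A concrete instance with made-up names: for two data refs and
 : : 16-byte vectors (MASK = 0xf), the emitted sequence is roughly
 : :
 : : addr2int0 = (ptr-sized int) &a[i];
 : : addr2int1 = (ptr-sized int) &b[i];
 : : orptrs1 = addr2int0 | addr2int1;
 : : andmask = orptrs1 & 0xf;
 : :
 : : with versioning condition "andmask == 0"; the mutual-alignment
 : : variant ORs XORs of the addresses instead, testing
 : : "((addr2int0 ^ addr2int1) & 0xf) == 0". */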
4007 : :
4008 : : /* Function vect_create_cond_for_vla_spec_read.
4009 : :
4010 : : Create a conditional expression that represents the run-time checks with
4011 : : max speculative read amount in VLA modes. We check two things:
4012 : : 1) if the max speculative read amount exceeds the min page size
4013 : : 2) if the VF is power-of-2 - done by checking the max read amount instead
4014 : :
4015 : : Input:
4016 : : COND_EXPR - input conditional expression. New conditions will be chained
4017 : : with logical AND operation.
4018 : : LOOP_VINFO - field LOOP_VINFO_MAX_SPEC_READ_AMOUNT contains the max
4019 : : possible speculative read amount in VLA modes.
4020 : :
4021 : : Output:
4022 : : COND_EXPR - conditional expression.
4023 : :
4024 : : The returned COND_EXPR is the conditional expression to be used in the
4025 : : if statement that controls which version of the loop gets executed at
4026 : : runtime. */
4027 : :
4028 : : static void
4029 : 0 : vect_create_cond_for_vla_spec_read (loop_vec_info loop_vinfo, tree *cond_expr)
4030 : : {
4031 : 0 : poly_uint64 read_amount_poly = LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo);
4032 : 0 : tree amount = build_int_cst (long_unsigned_type_node, read_amount_poly);
4033 : :
4034 : : /* Both the read amount and the VF must be non-constant poly values, and
4035 : : the read amount must be a constant power-of-2 multiple of the VF. */
4036 : 0 : unsigned HOST_WIDE_INT multiple;
4037 : 0 : gcc_assert (!read_amount_poly.is_constant ()
4038 : : && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
4039 : : && constant_multiple_p (read_amount_poly,
4040 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
4041 : : &multiple)
4042 : : && pow2p_hwi (multiple));
4043 : :
4044 : : tree cst_ul_zero = build_int_cstu (long_unsigned_type_node, 0U);
4045 : : tree cst_ul_one = build_int_cstu (long_unsigned_type_node, 1U);
4046 : : tree cst_ul_pagesize = build_int_cstu (long_unsigned_type_node,
4047 : : (unsigned long) param_min_pagesize);
4048 : :
4049 : :   /* Create "(amount & (amount - 1)) == 0": true iff the (nonzero) AMOUNT is a power of two.  */
4050 : : tree amount_m1 = fold_build2 (MINUS_EXPR, long_unsigned_type_node,
4051 : : amount, cst_ul_one);
4052 : : tree amount_and_expr = fold_build2 (BIT_AND_EXPR, long_unsigned_type_node,
4053 : : amount, amount_m1);
4054 : : tree powof2_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
4055 : : amount_and_expr, cst_ul_zero);
4056 : : chain_cond_expr (cond_expr, powof2_cond_expr);
4057 : :
4058 : :   /* Create the expression "amount <= cst_ul_pagesize".  */
4059 : : tree pagesize_cond_expr = fold_build2 (LE_EXPR, boolean_type_node,
4060 : : amount, cst_ul_pagesize);
4061 : : chain_cond_expr (cond_expr, pagesize_cond_expr);
4062 : : }
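     : :
     : : /* Editor's sketch (an illustration under stated assumptions, not GCC
     : :    code): the runtime condition built above is equivalent to the scalar
     : :    check below, where AMOUNT stands for the max speculative read amount
     : :    and PAGESZ for param_min_pagesize.  */
     : :
     : : static inline bool
     : : vla_spec_read_check_sketch (unsigned long amount, unsigned long pagesz)
     : : {
     : :   /* A power of two has exactly one bit set, so clearing the lowest set
     : :      bit via amount & (amount - 1) must yield zero.  */
     : :   bool pow2_p = (amount & (amount - 1)) == 0;
     : :   /* The speculative read window must fit within the minimum page.  */
     : :   return pow2_p && amount <= pagesz;
     : : }
     : :
     : : /* E.g. amount == 64 with pagesz == 4096 passes; amount == 96 fails the
     : :    power-of-two test since 96 & 95 == 64.  */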
4063 : :
4064 : : /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
4065 : : create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
4066 : : Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
4067 : : and this new condition are true. Treat a null *COND_EXPR as "true". */
4068 : :
4069 : : static void
4070 : 3231 : vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
4071 : : {
4072 : 3231 : const vec<vec_object_pair> &pairs
4073 : : = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
4074 : 3231 : unsigned int i;
4075 : 3231 : vec_object_pair *pair;
4076 : 3239 : FOR_EACH_VEC_ELT (pairs, i, pair)
4077 : : {
4078 : 8 : tree addr1 = build_fold_addr_expr (pair->first);
4079 : 8 : tree addr2 = build_fold_addr_expr (pair->second);
4080 : 8 : tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
4081 : : addr1, addr2);
4082 : 8 : chain_cond_expr (cond_expr, part_cond_expr);
4083 : : }
4084 : 3231 : }
4085 : :
4086 : : /* Create an expression that is true when all lower-bound conditions for
4087 : : the vectorized loop are met. Chain this condition with *COND_EXPR. */
4088 : :
4089 : : static void
4090 : 3231 : vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
4091 : : {
4092 : 3231 : const vec<vec_lower_bound> &lower_bounds
4093 : : = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
4094 : 3507 : for (unsigned int i = 0; i < lower_bounds.length (); ++i)
4095 : : {
4096 : 276 : tree expr = lower_bounds[i].expr;
4097 : 276 : tree type = unsigned_type_for (TREE_TYPE (expr));
4098 : 276 : expr = fold_convert (type, expr);
4099 : 276 : poly_uint64 bound = lower_bounds[i].min_value;
4100 : 276 : if (!lower_bounds[i].unsigned_p)
4101 : : {
4102 : 107 : expr = fold_build2 (PLUS_EXPR, type, expr,
4103 : : build_int_cstu (type, bound - 1));
4104 : 107 : bound += bound - 1;
4105 : : }
4106 : 276 : tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
4107 : : build_int_cstu (type, bound));
4108 : 276 : chain_cond_expr (cond_expr, part_cond_expr);
4109 : : }
4110 : 3231 : }
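     : :
     : : /* Editor's worked example (illustrative; assumes a 32-bit signed EXPR):
     : :    for min_value BOUND == 4 the code above tests
     : :    (unsigned) EXPR + 3 >= 7, which by unsigned wrap-around holds exactly
     : :    when |EXPR| >= 4:
     : :      EXPR =  5 -> 8 >= 7                              (pass)
     : :      EXPR =  3 -> 6 <  7                              (fail)
     : :      EXPR = -2 -> 0xfffffffe + 3 == 1 <  7            (fail)
     : :      EXPR = -4 -> 0xfffffffc + 3 == 0xffffffff >= 7   (pass)  */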
4111 : :
4112 : : /* Function vect_create_cond_for_alias_checks.
4113 : :
4114 : : Create a conditional expression that represents the run-time checks for
4115 : : overlapping of address ranges represented by a list of data references
4116 : : relations passed as input.
4117 : :
4118 : : Input:
4119 : :    COND_EXPR  - input conditional expression.  New conditions will be chained
4120 : :                 with a logical AND operation.  A null *COND_EXPR is
4121 : :                 treated as "true", as in the helpers above.
4122 : : LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
4123 : : to be checked.
4124 : :
4125 : : Output:
4126 : : COND_EXPR - conditional expression.
4127 : :
4128 : : The returned COND_EXPR is the conditional expression to be used in the if
4129 : : statement that controls which version of the loop gets executed at runtime.
4130 : : */
4131 : :
4132 : : void
4133 : 3231 : vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
4134 : : {
4135 : 3231 : const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
4136 : : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
4137 : :
4138 : 3231 : if (comp_alias_ddrs.is_empty ())
4139 : : return;
4140 : :
4141 : 3088 : create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
4142 : : &comp_alias_ddrs, cond_expr);
4143 : 3088 : if (dump_enabled_p ())
4144 : 1262 : dump_printf_loc (MSG_NOTE, vect_location,
4145 : : "created %u versioning for alias checks.\n",
4146 : : comp_alias_ddrs.length ());
4147 : : }
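     : :
     : : /* Editor's sketch (hedged; the actual checks are produced by
     : :    create_runtime_alias_checks and may be merged or simplified): for a
     : :    single pair of data references with start addresses A and B and
     : :    segment lengths SEG_A and SEG_B, the no-overlap condition is
     : :    conceptually the function below; uintptr_t and size_t are assumed
     : :    to be in scope via system.h.  */
     : :
     : : static inline bool
     : : segments_do_not_overlap_sketch (uintptr_t a, size_t seg_a,
     : :                                 uintptr_t b, size_t seg_b)
     : : {
     : :   /* Vectorization is safe if one segment ends before the other
     : :      begins.  */
     : :   return a + seg_a <= b || b + seg_b <= a;
     : : }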
4148 : :
4149 : :
4150 : : /* Function vect_loop_versioning.
4151 : :
4152 : :    If the loop has data references that may or may not be aligned and/or
4153 : :    has data reference relations whose independence was not proven, then
4154 : :    two versions of the loop need to be generated, one which is vectorized
4155 : :    and one which isn't.  A test is then generated to control which of the
4156 : :    loops is executed.  The test checks for the alignment of all of the
4157 : :    data references that may or may not be aligned.  An additional
4158 : :    sequence of runtime tests is generated for each pair of DDRs whose
4159 : :    independence was not proven.  The vectorized version of the loop is
4160 : :    executed only if both the alias and alignment tests pass.
4161 : :
4162 : :    The test generated to check which version of the loop is executed
4163 : :    is modified to also check for profitability as indicated by the
4164 : :    cost model threshold TH.
4165 : :
4166 : : The versioning precondition(s) are placed in *COND_EXPR and
4167 : : *COND_EXPR_STMT_LIST. */
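     : :
     : : /* Editor's sketch (illustrative only): versioning
     : :
     : :      for (i = 0; i < n; i++)
     : :        a[i] = b[i] + c[i];
     : :
     : :    under the runtime preconditions conceptually produces
     : :
     : :      if (cond)                    // alias/alignment/niters/cost checks
     : :        for (i = 0; i < n; i++)    // this copy gets vectorized
     : :          a[i] = b[i] + c[i];
     : :      else
     : :        for (i = 0; i < n; i++)    // scalar fallback version
     : :          a[i] = b[i] + c[i];  */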
4168 : :
4169 : : class loop *
4170 : 3704 : vect_loop_versioning (loop_vec_info loop_vinfo,
4171 : : gimple *loop_vectorized_call)
4172 : : {
4173 : 3704 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
4174 : 3704 : class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
4175 : 3704 : basic_block condition_bb;
4176 : 3704 : gphi_iterator gsi;
4177 : 3704 : gimple_stmt_iterator cond_exp_gsi;
4178 : 3704 : basic_block merge_bb;
4179 : 3704 : basic_block new_exit_bb;
4180 : 3704 : edge new_exit_e, e;
4181 : 3704 : gphi *orig_phi, *new_phi;
4182 : 3704 : tree cond_expr = NULL_TREE;
4183 : 3704 : gimple_seq cond_expr_stmt_list = NULL;
4184 : 3704 : tree arg;
4185 : 3704 : profile_probability prob = profile_probability::likely ();
4186 : 3704 : gimple_seq gimplify_stmt_list = NULL;
4187 : 3704 : tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
4188 : 3704 : bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
4189 : 3704 : bool version_spec_read = LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (loop_vinfo);
4190 : 3704 : bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
4191 : 3704 : bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
4192 : 3704 : poly_uint64 versioning_threshold
4193 : : = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
4194 : 3704 : tree version_simd_if_cond
4195 : : = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
4196 : 3704 : unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
4197 : :
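     : :   /* Note (editor's addition): SCALAR_LOOP_ITERS is LOOP_VINFO_NITERSM1,
     : :      i.e. the iteration count minus one, so comparing it against TH - 1
     : :      below implements the check "niters >= TH".  */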
4198 : 3704 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
4199 : : && !ordered_p (th, versioning_threshold))
4200 : : cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
4201 : : build_int_cst (TREE_TYPE (scalar_loop_iters),
4202 : : th - 1));
4203 : 3704 : if (maybe_ne (versioning_threshold, 0U))
4204 : : {
4205 : 3696 : tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
4206 : : build_int_cst (TREE_TYPE (scalar_loop_iters),
4207 : : versioning_threshold - 1));
4208 : 3696 : if (cond_expr)
4209 : 0 : cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
4210 : : expr, cond_expr);
4211 : : else
4212 : 3696 : cond_expr = expr;
4213 : : }
4214 : :
4215 : 3704 : tree cost_name = NULL_TREE;
4216 : 3704 : profile_probability prob2 = profile_probability::always ();
4217 : 3704 : if (cond_expr
4218 : 3696 : && EXPR_P (cond_expr)
4219 : 2554 : && (version_niter
4220 : 2554 : || version_align
4221 : : || version_alias
4222 : 2198 : || version_simd_if_cond))
4223 : : {
4224 : 2554 : cost_name = cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4225 : : &cond_expr_stmt_list,
4226 : : is_gimple_val, NULL_TREE);
4227 : :       /* Split PROB into two so that the overall probability of passing
4228 : :          both the cost-model and versioning checks equals the original PROB.  */
4229 : 2554 : prob2 = prob = prob.sqrt ();
4230 : : }
4231 : :
4232 : 3704 : if (version_niter)
4233 : 394 : vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
4234 : :
4235 : 3704 : if (cond_expr)
4236 : : {
4237 : 3696 : gimple_seq tem = NULL;
4238 : 3696 : cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4239 : : &tem, is_gimple_condexpr_for_cond,
4240 : : NULL_TREE);
4241 : 3696 : gimple_seq_add_seq (&cond_expr_stmt_list, tem);
4242 : : }
4243 : :
4244 : 3704 : if (version_align)
4245 : 50 : vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
4246 : : &cond_expr_stmt_list);
4247 : :
4248 : 3704 : if (version_spec_read)
4249 : 0 : vect_create_cond_for_vla_spec_read (loop_vinfo, &cond_expr);
4250 : :
4251 : 3704 : if (version_alias)
4252 : : {
4253 : 3231 : vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
4254 : 3231 : vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
4255 : 3231 : vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
4256 : : }
4257 : :
4258 : 3704 : if (version_simd_if_cond)
4259 : : {
4260 : 58 : gcc_assert (dom_info_available_p (CDI_DOMINATORS));
4261 : 58 : if (flag_checking)
4262 : 58 : if (basic_block bb
4263 : 58 : = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
4264 : 58 : gcc_assert (bb != loop->header
4265 : : && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
4266 : : && (scalar_loop == NULL
4267 : : || (bb != scalar_loop->header
4268 : : && dominated_by_p (CDI_DOMINATORS,
4269 : : scalar_loop->header, bb))));
4270 : 58 : tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
4271 : 58 : tree c = fold_build2 (NE_EXPR, boolean_type_node,
4272 : : version_simd_if_cond, zero);
4273 : 58 : if (cond_expr)
4274 : 58 : cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
4275 : : c, cond_expr);
4276 : : else
4277 : 0 : cond_expr = c;
4278 : 58 : if (dump_enabled_p ())
4279 : 5 : dump_printf_loc (MSG_NOTE, vect_location,
4280 : : "created versioning for simd if condition check.\n");
4281 : : }
4282 : :
4283 : 3704 : cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
4284 : : &gimplify_stmt_list,
4285 : : is_gimple_condexpr_for_cond, NULL_TREE);
4286 : 3704 : gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
4287 : :
4288 : :   /* Compute the outermost loop that cond_expr and cond_expr_stmt_list
4289 : :      are invariant in.  */
4290 : 3704 : class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
4291 : 3704 : for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
4292 : 67317 : !gsi_end_p (gsi); gsi_next (&gsi))
4293 : : {
4294 : 63613 : gimple *stmt = gsi_stmt (gsi);
4295 : 63613 : update_stmt (stmt);
4296 : 63613 : ssa_op_iter iter;
4297 : 63613 : use_operand_p use_p;
4298 : 63613 : basic_block def_bb;
4299 : 146333 : FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
4300 : 82720 : if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
4301 : 82720 : && flow_bb_inside_loop_p (outermost, def_bb))
4302 : 2498 : outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
4303 : : }
4304 : :
4305 : : /* Search for the outermost loop we can version. Avoid versioning of
4306 : : non-perfect nests but allow if-conversion versioned loops inside. */
4307 : 3704 : class loop *loop_to_version = loop;
4308 : 3704 : if (flow_loop_nested_p (outermost, loop))
4309 : : {
4310 : 1444 : if (dump_enabled_p ())
4311 : 649 : dump_printf_loc (MSG_NOTE, vect_location,
4312 : : "trying to apply versioning to outer loop %d\n",
4313 : : outermost->num);
4314 : 1444 : if (outermost->num == 0)
4315 : 1340 : outermost = superloop_at_depth (loop, 1);
4316 : : /* And avoid applying versioning on non-perfect nests. */
4317 : : while (loop_to_version != outermost
4318 : 129 : && (e = single_exit (loop_outer (loop_to_version)))
4319 : 108 : && !(e->flags & EDGE_COMPLEX)
4320 : 107 : && (!loop_outer (loop_to_version)->inner->next
4321 : 61 : || vect_loop_vectorized_call (loop_to_version))
4322 : 1542 : && (!loop_outer (loop_to_version)->inner->next
4323 : 3 : || !loop_outer (loop_to_version)->inner->next->next))
4324 : : loop_to_version = loop_outer (loop_to_version);
4325 : : }
4326 : :
4327 : :   /* Apply versioning.  If there is already a scalar version created by
4328 : :      if-conversion, re-use that.  Note we cannot re-use the copy of
4329 : :      an if-converted outer loop when vectorizing the inner loop only.  */
4330 : 3704 : gcond *cond;
4331 : 3704 : if ((!loop_to_version->inner || loop == loop_to_version)
4332 : 3667 : && loop_vectorized_call)
4333 : : {
4334 : 89 : gcc_assert (scalar_loop);
4335 : 89 : condition_bb = gimple_bb (loop_vectorized_call);
4336 : 178 : cond = as_a <gcond *> (*gsi_last_bb (condition_bb));
4337 : 89 : gimple_cond_set_condition_from_tree (cond, cond_expr);
4338 : 89 : update_stmt (cond);
4339 : :
4340 : 89 : if (cond_expr_stmt_list)
4341 : : {
4342 : 89 : cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
4343 : 89 : gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4344 : : GSI_SAME_STMT);
4345 : : }
4346 : :
4347 : :       /* If-conversion uses profile_probability::always () for both paths,
4348 : :          so reset the probabilities of both paths appropriately.  */
4349 : 89 : edge te, fe;
4350 : 89 : extract_true_false_edges_from_block (condition_bb, &te, &fe);
4351 : 89 : te->probability = prob;
4352 : 89 : fe->probability = prob.invert ();
4353 : :       /* We can scale loop counts immediately but have to postpone
4354 : :          scaling the scalar loop because we re-use it during peeling.
4355 : :
4356 : :          If-conversion duplicates the loop preheader and loop body and produces
4357 : :          a basic block after the loop exit.  We need to scale all of that.  */
4358 : 89 : basic_block preheader = loop_preheader_edge (loop_to_version)->src;
4359 : 89 : preheader->count = preheader->count.apply_probability (prob * prob2);
4360 : 89 : scale_loop_frequencies (loop_to_version, prob * prob2);
4361 : :       /* When the loop has multiple exits then we can only version the loop itself.
4362 : : This is denoted by loop_to_version == loop. In this case we can
4363 : : do the versioning by selecting the exit edge the vectorizer is
4364 : : currently using. */
4365 : 89 : edge exit_edge;
4366 : 89 : if (loop_to_version == loop)
4367 : 89 : exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4368 : : else
4369 : 0 : exit_edge = single_exit (loop_to_version);
4370 : 89 : exit_edge->dest->count = preheader->count;
4371 : 89 : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = (prob * prob2).invert ();
4372 : :
4373 : 89 : nloop = scalar_loop;
4374 : 89 : if (dump_enabled_p ())
4375 : 92 : dump_printf_loc (MSG_NOTE, vect_location,
4376 : : "reusing %sloop version created by if conversion\n",
4377 : : loop_to_version != loop ? "outer " : "");
4378 : 89 : }
4379 : : else
4380 : : {
4381 : 3615 : if (loop_to_version != loop
4382 : 3615 : && dump_enabled_p ())
4383 : 11 : dump_printf_loc (MSG_NOTE, vect_location,
4384 : : "applying loop versioning to outer loop %d\n",
4385 : : loop_to_version->num);
4386 : :
4387 : 3615 : unsigned orig_pe_idx = loop_preheader_edge (loop)->dest_idx;
4388 : :
4389 : 3615 : initialize_original_copy_tables ();
4390 : 7230 : nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
4391 : 3615 : prob * prob2, (prob * prob2).invert (),
4392 : 3615 : prob * prob2, (prob * prob2).invert (),
4393 : : true);
4394 : :
4395 : : /* If the PHI nodes in the loop header were reallocated, we need to fix up
4396 : : our internally stashed copies of those. */
4397 : 3615 : if (loop_to_version == loop)
4398 : 3578 : for (auto gsi = gsi_start_phis (loop->header);
4399 : 14160 : !gsi_end_p (gsi); gsi_next (&gsi))
4400 : 10582 : loop_vinfo->resync_stmt_addr (gsi.phi ());
4401 : :
4402 : :       /* We will later insert a second conditional so the overall outcome
4403 : :          of both is prob * prob2.  */
4404 : 3615 : edge true_e, false_e;
4405 : 3615 : extract_true_false_edges_from_block (condition_bb, &true_e, &false_e);
4406 : 3615 : true_e->probability = prob;
4407 : 3615 : false_e->probability = prob.invert ();
4408 : 3615 : gcc_assert (nloop);
4409 : 3615 : nloop = get_loop_copy (loop);
4410 : :
4411 : :       /* For cycle vectorization with SLP we rely on the PHI arguments
4412 : :          appearing in the same order as the SLP node operands, which for the
4413 : :          loop PHI nodes means the preheader edge dest index needs to remain
4414 : :          the same for the analyzed loop, which also becomes the vectorized one.
4415 : :          Make it so in case the state after versioning differs, by redirecting
4416 : :          the first edge into the header to the same destination, which moves
4417 : :          it last.  */
4418 : 3615 : if (loop_preheader_edge (loop)->dest_idx != orig_pe_idx)
4419 : : {
4420 : 280 : edge e = EDGE_PRED (loop->header, 0);
4421 : 280 : ssa_redirect_edge (e, e->dest);
4422 : 280 : flush_pending_stmts (e);
4423 : : }
4424 : 3615 : gcc_assert (loop_preheader_edge (loop)->dest_idx == orig_pe_idx);
4425 : :
4426 : :       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy; nobody will
4427 : :          reap those otherwise, and they also refer to the original
4428 : :          loops.  */
4429 : : class loop *l = loop;
4430 : 3618 : while (gimple *call = vect_loop_vectorized_call (l))
4431 : : {
4432 : 3 : call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
4433 : 3 : fold_loop_internal_call (call, boolean_false_node);
4434 : 3 : l = loop_outer (l);
4435 : 3 : }
4436 : 3615 : free_original_copy_tables ();
4437 : :
4438 : 3615 : if (cond_expr_stmt_list)
4439 : : {
4440 : 3535 : cond_exp_gsi = gsi_last_bb (condition_bb);
4441 : 3535 : gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
4442 : : GSI_SAME_STMT);
4443 : : }
4444 : :
4445 : : /* Loop versioning violates an assumption we try to maintain during
4446 : : vectorization - that the loop exit block has a single predecessor.
4447 : : After versioning, the exit block of both loop versions is the same
4448 : :          basic block (i.e. it has two predecessors).  Just to simplify the
4449 : :          following transformations in the vectorizer, we fix this situation
4450 : :          here by adding a new (empty) block on the exit-edge of the loop,
4451 : :          with the proper loop-exit phis to maintain loop-closed-form.
4452 : :          If loop versioning wasn't done from loop, but from scalar_loop
4453 : :          instead, merge_bb will already have just a single predecessor.  */
4454 : :
4455 : :       /* When the loop has multiple exits then we can only version the loop itself.
4456 : : This is denoted by loop_to_version == loop. In this case we can
4457 : : do the versioning by selecting the exit edge the vectorizer is
4458 : : currently using. */
4459 : 3615 : edge exit_edge;
4460 : 3615 : if (loop_to_version == loop)
4461 : 3578 : exit_edge = LOOP_VINFO_IV_EXIT (loop_vinfo);
4462 : : else
4463 : 37 : exit_edge = single_exit (loop_to_version);
4464 : :
4465 : 3615 : gcc_assert (exit_edge);
4466 : 3615 : merge_bb = exit_edge->dest;
4467 : 3615 : if (EDGE_COUNT (merge_bb->preds) >= 2)
4468 : : {
4469 : 3615 : gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
4470 : 3615 : new_exit_bb = split_edge (exit_edge);
4471 : 3615 : new_exit_e = exit_edge;
4472 : 3615 : e = EDGE_SUCC (new_exit_bb, 0);
4473 : :
4474 : 7456 : for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
4475 : 3841 : gsi_next (&gsi))
4476 : : {
4477 : 3841 : tree new_res;
4478 : 3841 : orig_phi = gsi.phi ();
4479 : 3841 : new_res = copy_ssa_name (PHI_RESULT (orig_phi));
4480 : 3841 : new_phi = create_phi_node (new_res, new_exit_bb);
4481 : 3841 : arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
4482 : 3841 : add_phi_arg (new_phi, arg, new_exit_e,
4483 : : gimple_phi_arg_location_from_edge (orig_phi, e));
4484 : 3841 : adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
4485 : : }
4486 : : }
4487 : :
4488 : 3615 : update_ssa (TODO_update_ssa_no_phi);
4489 : : }
4490 : :
4491 : : /* Split the cost model check off to a separate BB. Costing assumes
4492 : : this is the only thing we perform when we enter the scalar loop
4493 : : from a failed cost decision. */
4494 : 3704 : if (cost_name && TREE_CODE (cost_name) == SSA_NAME)
4495 : : {
4496 : 2554 : gimple *def = SSA_NAME_DEF_STMT (cost_name);
4497 : 2554 : gcc_assert (gimple_bb (def) == condition_bb);
4498 : : /* All uses of the cost check are 'true' after the check we
4499 : : are going to insert. */
4500 : 2554 : replace_uses_by (cost_name, boolean_true_node);
4501 : : /* And we're going to build the new single use of it. */
4502 : 2554 : gcond *cond = gimple_build_cond (NE_EXPR, cost_name, boolean_false_node,
4503 : : NULL_TREE, NULL_TREE);
4504 : 2554 : edge e = split_block (gimple_bb (def), def);
4505 : 2554 : gimple_stmt_iterator gsi = gsi_for_stmt (def);
4506 : 2554 : gsi_insert_after (&gsi, cond, GSI_NEW_STMT);
4507 : 2554 : edge true_e, false_e;
4508 : 2554 : extract_true_false_edges_from_block (e->dest, &true_e, &false_e);
4509 : 2554 : e->flags &= ~EDGE_FALLTHRU;
4510 : 2554 : e->flags |= EDGE_TRUE_VALUE;
4511 : 2554 : edge e2 = make_edge (e->src, false_e->dest, EDGE_FALSE_VALUE);
4512 : 2554 : e->probability = prob2;
4513 : 2554 : e2->probability = prob2.invert ();
4514 : 2554 : e->dest->count = e->count ();
4515 : 2554 : set_immediate_dominator (CDI_DOMINATORS, false_e->dest, e->src);
4516 : 2554 : auto_vec<basic_block, 3> adj;
4517 : 2554 : for (basic_block son = first_dom_son (CDI_DOMINATORS, e->dest);
4518 : 7594 : son;
4519 : 5040 : son = next_dom_son (CDI_DOMINATORS, son))
4520 : 7526 : if (EDGE_COUNT (son->preds) > 1)
4521 : 2486 : adj.safe_push (son);
4522 : 10148 : for (auto son : adj)
4523 : 2486 : set_immediate_dominator (CDI_DOMINATORS, son, e->src);
4526 : 2554 : }
4527 : :
4528 : 3704 : if (version_niter)
4529 : : {
4530 : :       /* The versioned loop could be infinite, so we need to clear the
4531 : :          existing niter information that was copied from the original loop.  */
4532 : 394 : gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
4533 : 394 : vect_free_loop_info_assumptions (nloop);
4534 : : }
4535 : :
4536 : 3704 : if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
4537 : 3704 : && dump_enabled_p ())
4538 : : {
4539 : 791 : if (version_alias)
4540 : 731 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4541 : : vect_location,
4542 : : "loop versioned for vectorization because of "
4543 : : "possible aliasing\n");
4544 : 791 : if (version_align)
4545 : 20 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
4546 : : vect_location,
4547 : : "loop versioned for vectorization to enhance "
4548 : : "alignment\n");
4549 : :
4550 : : }
4551 : :
4552 : 3704 : return nloop;
4553 : : }
|