Line data Source code
1 : /* Loop Vectorization
2 : Copyright (C) 2003-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #define INCLUDE_ALGORITHM
23 : #include "config.h"
24 : #include "system.h"
25 : #include "coretypes.h"
26 : #include "backend.h"
27 : #include "target.h"
28 : #include "rtl.h"
29 : #include "tree.h"
30 : #include "gimple.h"
31 : #include "cfghooks.h"
32 : #include "tree-pass.h"
33 : #include "ssa.h"
34 : #include "optabs-tree.h"
35 : #include "memmodel.h"
36 : #include "optabs.h"
37 : #include "diagnostic-core.h"
38 : #include "fold-const.h"
39 : #include "stor-layout.h"
40 : #include "cfganal.h"
41 : #include "gimplify.h"
42 : #include "gimple-iterator.h"
43 : #include "gimplify-me.h"
44 : #include "tree-ssa-loop-ivopts.h"
45 : #include "tree-ssa-loop-manip.h"
46 : #include "tree-ssa-loop-niter.h"
47 : #include "tree-ssa-loop.h"
48 : #include "cfgloop.h"
49 : #include "tree-scalar-evolution.h"
50 : #include "tree-vectorizer.h"
51 : #include "gimple-fold.h"
52 : #include "cgraph.h"
53 : #include "tree-cfg.h"
54 : #include "tree-if-conv.h"
55 : #include "internal-fn.h"
56 : #include "tree-vector-builder.h"
57 : #include "vec-perm-indices.h"
58 : #include "tree-eh.h"
59 : #include "case-cfn-macros.h"
60 : #include "langhooks.h"
61 : #include "opts.h"
62 : #include "hierarchical_discriminator.h"
63 :
64 : /* Loop Vectorization Pass.
65 :
66 : This pass tries to vectorize loops.
67 :
68 : For example, the vectorizer transforms the following simple loop:
69 :
70 : short a[N]; short b[N]; short c[N]; int i;
71 :
72 : for (i=0; i<N; i++){
73 : a[i] = b[i] + c[i];
74 : }
75 :
76 : as if it was manually vectorized by rewriting the source code into:
77 :
78 : typedef int __attribute__((mode(V8HI))) v8hi;
79 : short a[N]; short b[N]; short c[N]; int i;
80 : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
81 : v8hi va, vb, vc;
82 :
83 : for (i=0; i<N/8; i++){
84 : vb = pb[i];
85 : vc = pc[i];
86 : va = vb + vc;
87 : pa[i] = va;
88 : }
89 :
90 : The main entry to this pass is vectorize_loops(), in which
91 : the vectorizer applies a set of analyses on a given set of loops,
92 : followed by the actual vectorization transformation for the loops that
93 : had successfully passed the analysis phase.
94 : Throughout this pass we make a distinction between two types of
95 : data: scalars (which are represented by SSA_NAMES), and memory references
96 : ("data-refs"). These two types of data require different handling both
97 : during analysis and transformation. The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
99 : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
100 : accesses are required to have a simple (consecutive) access pattern.
101 :
102 : Analysis phase:
103 : ===============
104 : The driver for the analysis phase is vect_analyze_loop().
105 : It applies a set of analyses, some of which rely on the scalar evolution
106 : analyzer (scev) developed by Sebastian Pop.
107 :
108 : During the analysis phase the vectorizer records some information
109 : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
110 : loop, as well as general information about the loop as a whole, which is
111 : recorded in a "loop_vec_info" struct attached to each loop.
112 :
113 : Transformation phase:
114 : =====================
115 : The loop transformation phase scans all the stmts in the loop, and
116 : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
117 : the loop that needs to be vectorized. It inserts the vector code sequence
118 : just before the scalar stmt S, and records a pointer to the vector code
119 : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
120 : attached to S). This pointer will be used for the vectorization of following
121 : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
122 : otherwise, we rely on dead code elimination for removing it.
123 :
124 : For example, say stmt S1 was vectorized into stmt VS1:
125 :
126 : VS1: vb = px[i];
127 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
128 : S2: a = b;
129 :
130 : To vectorize stmt S2, the vectorizer first finds the stmt that defines
131 : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
132 : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
133 : resulting sequence would be:
134 :
135 : VS1: vb = px[i];
136 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
137 : VS2: va = vb;
138 : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
139 :
140 : Operands that are not SSA_NAMEs, are data-refs that appear in
141 : load/store operations (like 'x[i]' in S1), and are handled differently.
142 :
143 : Target modeling:
144 : =================
145 : Currently the only target specific information that is used is the
146 : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
147 : Targets that can support different sizes of vectors, for now will need
148 : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
149 : flexibility will be added in the future.
150 :
   Since we only vectorize operations whose vector form can be
152 : expressed using existing tree codes, to verify that an operation is
153 : supported, the vectorizer checks the relevant optab at the relevant
154 : machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
155 : the value found is CODE_FOR_nothing, then there's no target support, and
156 : we can't vectorize the stmt.
157 :
158 : For additional information on this project see:
159 : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 : */
161 :
162 : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
163 : unsigned *);
164 : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
165 : gphi **);
166 :
167 :
/* Function vect_is_simple_iv_evolution.

   Check whether ACCESS_FN, the scalar evolution of a loop-header PHI in
   the loop with number LOOP_NB, describes a simple induction, i.e.
   BASE + i * STEP with a step that does not itself evolve inside the
   loop.  On success record the base and step in STMT_INFO's
   LOOP_PHI_EVOLUTION fields and return true.

   FORNOW: A simple evolution of an induction variables in the loop is
   considered a polynomial evolution.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
			     stmt_vec_info stmt_info)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
		     step_expr, init_expr);

  /* Record base and step even if we end up rejecting the step below;
     callers only consult these fields when this function returned true.  */
  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;

  /* The step is acceptable when it is an integer constant, an SSA name
     defined outside of the loop (non-integral SSA steps additionally
     require -fassociative-math for floats), or a REAL_CST under
     -fassociative-math.  Anything else is an unknown step.  */
  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
		  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
	  || !flag_associative_math))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step unknown.\n");
      return false;
    }

  return true;
}
220 :
221 : /* Function vect_is_nonlinear_iv_evolution
222 :
223 : Only support nonlinear induction for integer type
224 : 1. neg
225 : 2. mul by constant
226 : 3. lshift/rshift by constant.
227 :
228 : For neg induction, return a fake step as integer -1. */
229 : static bool
230 143960 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
231 : gphi* loop_phi_node)
232 : {
233 143960 : tree init_expr, ev_expr, result, op1, op2;
234 143960 : gimple* def;
235 :
236 143960 : if (gimple_phi_num_args (loop_phi_node) != 2)
237 : return false;
238 :
239 143960 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
240 143960 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
241 :
242 : /* Support nonlinear induction only for integer type. */
243 143960 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
244 : return false;
245 :
246 86519 : result = PHI_RESULT (loop_phi_node);
247 :
248 86519 : if (TREE_CODE (ev_expr) != SSA_NAME
249 84220 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
250 86519 : || !is_gimple_assign (def))
251 : return false;
252 :
253 78349 : enum tree_code t_code = gimple_assign_rhs_code (def);
254 78349 : tree step;
255 78349 : switch (t_code)
256 : {
257 1808 : case NEGATE_EXPR:
258 1808 : if (gimple_assign_rhs1 (def) != result)
259 : return false;
260 1808 : step = build_int_cst (TREE_TYPE (init_expr), -1);
261 1808 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
262 1808 : break;
263 :
264 11310 : case RSHIFT_EXPR:
265 11310 : case LSHIFT_EXPR:
266 11310 : case MULT_EXPR:
267 11310 : op1 = gimple_assign_rhs1 (def);
268 11310 : op2 = gimple_assign_rhs2 (def);
269 11310 : if (TREE_CODE (op2) != INTEGER_CST
270 7440 : || op1 != result)
271 : return false;
272 7055 : step = op2;
273 7055 : if (t_code == LSHIFT_EXPR)
274 472 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
275 6583 : else if (t_code == RSHIFT_EXPR)
276 5615 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
277 : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
278 : else
279 968 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
280 : break;
281 :
282 : default:
283 : return false;
284 : }
285 :
286 8863 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
287 8863 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
288 :
289 8863 : return true;
290 : }
291 :
/* Returns true if Phi is a first-order recurrence. A first-order
   recurrence is a non-reduction recurrence relation in which the value of
   the recurrence in the current loop iteration equals a value defined in
   the previous iteration.  */

static bool
vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
				   gphi *phi)
{
  /* A nested cycle isn't vectorizable as first order recurrence.  */
  if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
    return false;

  /* Ensure the loop latch definition is from within the loop.
     The latch value must be a non-default SSA name defined by a
     non-PHI statement inside LOOP.  */
  edge latch = loop_latch_edge (loop);
  tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
  if (TREE_CODE (ldef) != SSA_NAME
      || SSA_NAME_IS_DEFAULT_DEF (ldef)
      || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
      || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
    return false;

  tree def = gimple_phi_result (phi);

  /* Ensure every use_stmt of the phi node is dominated by the latch
     definition.  Debug uses are ignored; a use by the latch definition
     itself also disqualifies the PHI.  */
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    if (!is_gimple_debug (USE_STMT (use_p))
	&& (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
	    || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
					    USE_STMT (use_p))))
      return false;

  /* First-order recurrence autovectorization needs shuffle vector.
     Require a vector type for the scalar result.  */
  tree scalar_type = TREE_TYPE (def);
  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
  if (!vectype)
    return false;

  return true;
}
335 :
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).

   Works in two phases over the loop-header PHIs: first classify all
   inductions (simple and nonlinear IVs), then classify the remaining
   PHIs as reductions, double reductions, nested cycles or first-order
   recurrences.  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
{
  basic_block bb = loop->header;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
	continue;

      /* Skip already analyzed inner loop PHIs of double reductions.  */
      if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
	continue;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 (gimple *) phi);

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Access function of PHI: %T\n", access_fn);
      if (access_fn)
	STRIP_NOPS (access_fn);

      /* Not a simple (or acceptable nonlinear) induction: defer to the
	 reduction/cycle analysis below.  */
      if ((!access_fn
	   || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
	   || (LOOP_VINFO_LOOP (loop_vinfo) != loop
	       && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
		   != INTEGER_CST)))
	  /* Only handle nonlinear iv for same loop.  */
	  && (LOOP_VINFO_LOOP (loop_vinfo) != loop
	      || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
	{
	  worklist.safe_push (stmt_vinfo);
	  continue;
	}

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
		  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;

      /* Mark if we have a non-linear IV.  */
      LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 (gimple *) phi);

      gcc_assert (!virtual_operand_p (def)
		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      gphi *double_reduc;
      stmt_vec_info reduc_stmt_info
	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
      if (reduc_stmt_info && double_reduc)
	{
	  /* Double reduction: the outer PHI accumulates across the outer
	     loop and the inner PHI (DOUBLE_REDUC) across the inner loop;
	     verify the inner cycle is itself a reduction.  */
	  stmt_vec_info inner_phi_info
	    = loop_vinfo->lookup_stmt (double_reduc);
	  /* ??? Pass down flag we're the inner loop of a double reduc.  */
	  stmt_vec_info inner_reduc_info
	    = vect_is_simple_reduction (loop_vinfo, inner_phi_info, NULL);
	  if (inner_reduc_info)
	    {
	      STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	      STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	      STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
	      STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected double reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
	      /* Make it accessible for SLP vectorization.  */
	      LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
	    }
	  else if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Unknown def-use cycle pattern.\n");
	}
      else if (reduc_stmt_info)
	{
	  /* A reduction in the inner loop of the loop nest is only a
	     nested cycle from the point of view of the outer loop.  */
	  if (loop != LOOP_VINFO_LOOP (loop_vinfo))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected vectorizable nested cycle.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
	    }
	  else
	    {
	      STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	      STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
	      LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
	    }
	}
      else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
	STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
      else
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown def-use cycle pattern.\n");
    }
}
488 :
489 :
490 : /* Function vect_analyze_scalar_cycles.
491 :
492 : Examine the cross iteration def-use cycles of scalar variables, by
493 : analyzing the loop-header PHIs of scalar variables. Classify each
494 : cycle as one of the following: invariant, induction, reduction, unknown.
495 : We do that for the loop represented by LOOP_VINFO, and also to its
496 : inner-loop, if exists.
497 : Examples for scalar cycles:
498 :
499 : Example1: reduction:
500 :
501 : loop1:
502 : for (i=0; i<N; i++)
503 : sum += a[i];
504 :
505 : Example2: induction:
506 :
507 : loop2:
508 : for (i=0; i<N; i++)
509 : a[i] = i; */
510 :
511 : static void
512 371565 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
513 : {
514 371565 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
515 :
516 371565 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
517 :
518 : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
519 : Reductions in such inner-loop therefore have different properties than
520 : the reductions in the nest that gets vectorized:
521 : 1. When vectorized, they are executed in the same order as in the original
522 : scalar loop, so we can't change the order of computation when
523 : vectorizing them.
524 : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
525 : current checks are too strict. */
526 :
527 371565 : if (loop->inner)
528 5527 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
529 371565 : }
530 :
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   Only MAIN_EXIT contributes the iteration counts; the other exits of
   LOOP merely contribute their controlling conditions to the returned
   vector.

   Return the loop exit conditions.  */


static vec<gcond *>
vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
		      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  vec<gcond *> conds;
  conds.create (exits.length ());
  class tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;

  /* Defaults in case no exit can be analyzed.  */
  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;

  DUMP_VECT_SCOPE ("get_loop_niters");

  if (exits.is_empty ())
    return conds;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
		     exits.length ());

  edge exit;
  unsigned int i;
  FOR_EACH_VEC_ELT (exits, i, exit)
    {
      gcond *cond = get_loop_exit_condition (exit);
      if (cond)
	conds.safe_push (cond);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);

      /* Only the main exit determines the iteration counts.  */
      if (exit != main_exit)
	continue;

      may_be_zero = NULL_TREE;
      if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
	  || chrec_contains_undetermined (niter_desc.niter))
	continue;

      niter_assumptions = niter_desc.assumptions;
      may_be_zero = niter_desc.may_be_zero;
      niter = niter_desc.niter;

      if (may_be_zero && integer_zerop (may_be_zero))
	may_be_zero = NULL_TREE;

      if (may_be_zero)
	{
	  if (COMPARISON_CLASS_P (may_be_zero))
	    {
	      /* Try to combine may_be_zero with assumptions, this can simplify
		 computation of niter expression.  */
	      if (niter_assumptions && !integer_nonzerop (niter_assumptions))
		niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
						 niter_assumptions,
						 fold_build1 (TRUTH_NOT_EXPR,
							      boolean_type_node,
							      may_be_zero));
	      else
		niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
				     build_int_cst (TREE_TYPE (niter), 0),
				     rewrite_to_non_trapping_overflow (niter));

	      may_be_zero = NULL_TREE;
	    }
	  else if (integer_nonzerop (may_be_zero))
	    {
	      /* The latch is provably never executed: one header execution,
		 zero latch executions.  */
	      *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
	      *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
	      continue;
	    }
	  else
	    continue;
	}

      /* Loop assumptions are based off the normal exit.  */
      *assumptions = niter_assumptions;
      *number_of_iterationsm1 = niter;

      /* We want the number of loop header executions which is the number
	 of latch executions plus one.
	 ??? For UINT_MAX latch executions this number overflows to zero
	 for loops like do { n++; } while (n != 0);  */
      if (niter && !chrec_contains_undetermined (niter))
	{
	  niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
			       unshare_expr (niter),
			       build_int_cst (TREE_TYPE (niter), 1));
	  if (TREE_CODE (niter) == INTEGER_CST
	      && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
	    {
	      /* If we manage to fold niter + 1 into INTEGER_CST even when
		 niter is some complex expression, ensure back
		 *number_of_iterationsm1 is an INTEGER_CST as well.  See
		 PR113210.  */
	      *number_of_iterationsm1
		= fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
			       build_minus_one_cst (TREE_TYPE (niter)));
	    }
	}
      *number_of_iterations = niter;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");

  return conds;
}
652 :
/* Determine the main loop exit for the vectorizer.  Returns NULL when
   no suitable exit exists (no exits at all, or an exit without a
   controlling condition).  */

edge
vec_init_loop_exit_info (class loop *loop)
{
  /* Before we begin we must first determine which exit is the main one and
     which are auxiliary exits.  */
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  if (exits.length () == 0)
    return NULL;
  if (exits.length () == 1)
    return exits[0];

  /* If we have multiple exits, look for counting IV exit.
     Analyze all exits and return the last one we can analyze.  */
  class tree_niter_desc niter_desc;
  edge candidate = NULL;
  for (edge exit : exits)
    {
      if (!get_loop_exit_condition (exit))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Unhandled loop exit detected.\n");
	  return NULL;
	}

      if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
	  && !chrec_contains_undetermined (niter_desc.niter))
	{
	  tree may_be_zero = niter_desc.may_be_zero;
	  if ((integer_zerop (may_be_zero)
	       /* As we are handling may_be_zero that's not false by
		  rewriting niter to may_be_zero ? 0 : niter we require
		  an empty latch.  */
	       || (single_pred_p (loop->latch)
		   && exit->src == single_pred (loop->latch)
		   && (integer_nonzerop (may_be_zero)
		       || COMPARISON_CLASS_P (may_be_zero))))
	      /* Among analyzable exits prefer the one whose source block
		 is dominated by previous candidates, i.e. the last one.  */
	      && (!candidate
		  || dominated_by_p (CDI_DOMINATORS, exit->src,
				     candidate->src)))
	    candidate = exit;
	}
    }

  /* If no exit is analyzable by scalar evolution, we return the last exit
     under the assumption we are dealing with an uncounted loop.  */
  if (!candidate && single_pred_p (loop->latch))
    candidate = loop_exits_from_bb_p (loop, single_pred (loop->latch));

  return candidate;
}
706 :
707 : /* Function bb_in_loop_p
708 :
709 : Used as predicate for dfs order traversal of the loop bbs. */
710 :
711 : static bool
712 1509031 : bb_in_loop_p (const_basic_block bb, const void *data)
713 : {
714 1509031 : const class loop *const loop = (const class loop *)data;
715 1509031 : if (flow_bb_inside_loop_p (loop, bb))
716 : return true;
717 : return false;
718 : }
719 :
720 :
/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.  Also records the
   #pragma omp simd if (x) condition (SIMD_IF_COND) when the loop carries
   a .GOMP_SIMD_LANE call with three arguments.  */

_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
  : vec_info (vec_info::loop, shared),
    loop (loop_in),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    vector_costs (nullptr),
    scalar_costs (nullptr),
    th (0),
    versioning_threshold (0),
    vectorization_factor (0),
    main_loop_edge (nullptr),
    skip_main_loop_edge (nullptr),
    skip_this_loop_edge (nullptr),
    reusable_accumulators (),
    suggested_unroll_factor (1),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    mask_skip_niters_pfa_offset (NULL_TREE),
    rgroup_compare_type (NULL_TREE),
    simd_if_cond (NULL_TREE),
    partial_vector_style (vect_partial_vectors_none),
    unaligned_dr (NULL),
    peeling_for_alignment (0),
    ptr_mask (0),
    max_spec_read_amount (0),
    nonlinear_iv (false),
    ivexpr_map (NULL),
    scan_map (NULL),
    inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
    vectorizable (false),
    can_use_partial_vectors_p (true),
    must_use_partial_vectors_p (false),
    using_partial_vectors_p (false),
    using_decrementing_iv_p (false),
    using_select_vl_p (false),
    allow_mutual_alignment (false),
    partial_load_store_bias (0),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    early_breaks (false),
    loop_iv_cond (NULL),
    user_unroll (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop_scaling (profile_probability::uninitialized ()),
    scalar_loop (NULL),
    main_loop_info (NULL),
    orig_loop_info (NULL),
    epilogue_vinfo (NULL),
    drs_advanced_by (NULL_TREE),
    vec_loop_main_exit (NULL),
    vec_epilogue_loop_main_exit (NULL),
    scalar_loop_main_exit (NULL)
{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would the same
     as reversed postorder traversal, so we are safe.  */

  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
			     loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  for (unsigned int i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      gimple_stmt_iterator si;

      /* Create stmt_vec_infos for the PHIs, resetting their UIDs.  */
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *phi = gsi_stmt (si);
	  gimple_set_uid (phi, 0);
	  add_stmt (phi);
	}

      /* Likewise for the non-debug statements of the block.  */
      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
	     third argument is the #pragma omp simd if (x) condition, when 0,
	     loop shouldn't be vectorized, when non-zero constant, it should
	     be vectorized normally, otherwise versioned with vectorized loop
	     done if the condition is non-zero at runtime.  */
	  if (loop_in->simduid
	      && is_gimple_call (stmt)
	      && gimple_call_internal_p (stmt)
	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
	      && gimple_call_num_args (stmt) >= 3
	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
	      && (loop_in->simduid
		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
	    {
	      tree arg = gimple_call_arg (stmt, 2);
	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
		simd_if_cond = arg;
	      else
		gcc_assert (integer_nonzerop (arg));
	    }
	}
    }
}
832 :
833 : /* Free all levels of rgroup CONTROLS. */
834 :
835 : void
836 1246215 : release_vec_loop_controls (vec<rgroup_controls> *controls)
837 : {
838 1246215 : rgroup_controls *rgc;
839 1246215 : unsigned int i;
840 1263789 : FOR_EACH_VEC_ELT (*controls, i, rgc)
841 17574 : rgc->controls.release ();
842 1246215 : controls->release ();
843 1246215 : }
844 :
/* Free all memory used by the _loop_vec_info, as well as all the
   stmt_vec_info structs of all the stmts in the loop.  */

_loop_vec_info::~_loop_vec_info ()
{
  free (bbs);

  /* Release the rgroup controls recorded for masked and for
     length-controlled partial vectorization.  */
  release_vec_loop_controls (&masks.rgc_vec);
  release_vec_loop_controls (&lens);
  delete ivexpr_map;
  delete scan_map;
  delete scalar_costs;
  delete vector_costs;
  for (auto reduc_info : reduc_infos)
    delete reduc_info;

  /* When we release an epilogue vinfo that we do not intend to use
     avoid clearing AUX of the main loop which should continue to
     point to the main loop vinfo since otherwise we'll leak that.  */
  if (loop->aux == this)
    loop->aux = NULL;
}
867 :
/* Return an invariant or register for EXPR and emit necessary
   computations in the LOOP_VINFO loop preheader.  */

tree
cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
{
  /* Registers and invariants need no extra computation.  */
  if (is_gimple_reg (expr)
      || is_gimple_min_invariant (expr))
    return expr;

  /* Lazily allocate the cache mapping already-gimplified expressions
     to the value they were reduced to.  */
  if (! loop_vinfo->ivexpr_map)
    loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
  tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
  if (! cached)
    {
      /* First time we see EXPR: gimplify an unshared copy and insert
	 the generated statements on the preheader edge so they execute
	 once before the loop.  */
      gimple_seq stmts = NULL;
      cached = force_gimple_operand (unshare_expr (expr),
				     &stmts, true, NULL_TREE);
      if (stmts)
	{
	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
	  gsi_insert_seq_on_edge_immediate (e, stmts);
	}
    }
  return cached;
}
894 :
895 : /* Return true if we can use CMP_TYPE as the comparison type to produce
896 : all masks required to mask LOOP_VINFO. */
897 :
898 : static bool
899 78817 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
900 : {
901 78817 : rgroup_controls *rgm;
902 78817 : unsigned int i;
903 91375 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
904 91375 : if (rgm->type != NULL_TREE
905 91375 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
906 : cmp_type, rgm->type,
907 : OPTIMIZE_FOR_SPEED))
908 : return false;
909 : return true;
910 : }
911 :
912 : /* Calculate the maximum number of scalars per iteration for every
913 : rgroup in LOOP_VINFO. */
914 :
915 : static unsigned int
916 16755 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
917 : {
918 16755 : unsigned int res = 1;
919 16755 : unsigned int i;
920 16755 : rgroup_controls *rgm;
921 41044 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
922 24289 : res = MAX (res, rgm->max_nscalars_per_iter);
923 16755 : return res;
924 : }
925 :
/* Calculate the minimum precision necessary to represent:

      MAX_NITERS * FACTOR

   as an unsigned integer, where MAX_NITERS is the maximum number of
   loop header iterations for the original scalar form of LOOP_VINFO.  */

unsigned
vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Get the maximum number of iterations that is representable
     in the counter type.  For uncounted loops fall back to sizetype.  */
  tree ni_type;
  if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
    ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
  else
    ni_type = sizetype;
  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;

  /* Get a more refined estimate for the number of iterations.
     max_loop_iterations counts back edges, hence the + 1.  */
  widest_int max_back_edges;
  if (max_loop_iterations (loop, &max_back_edges))
    max_ni = wi::smin (max_ni, max_back_edges + 1);

  /* Work out how many bits we need to represent the limit.  */
  return wi::min_precision (max_ni * factor, UNSIGNED);
}
955 :
/* True if the loop needs peeling or partial vectors when vectorized.  */

static bool
vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
{
  unsigned HOST_WIDE_INT const_vf;

  /* Peeling for gaps always requires an epilogue.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    return true;

  /* For an epilogue loop the prologue peeling decision was made on
     the main loop, so query that vinfo instead.  */
  loop_vec_info main_loop_vinfo
    = (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
       ? LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) : loop_vinfo);
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo) >= 0)
    {
      /* Work out the (constant) number of iterations that need to be
	 peeled for reasons other than niters.  */
      unsigned int peel_niter
	= LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
      return !multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
			  LOOP_VINFO_VECT_FACTOR (loop_vinfo));
    }

  if (!LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo)
      && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf))
    {
      /* When the number of iterations is a multiple of the vectorization
	 factor and we are not doing prologue or forced epilogue peeling
	 the epilogue isn't necessary.  The trailing-zero-bit count of the
	 symbolic niters proves divisibility by a power-of-two VF.  */
      if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
	  >= (unsigned) exact_log2 (const_vf))
	return false;
    }

  return true;
}
993 :
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate the masks required.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */

static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  unsigned int min_ni_width;

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Produce the rgroup controls.  rgc_vec is indexed by nvectors - 1
     for the WHILE_ULT scheme.  */
  for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    {
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      tree vectype = mask.first;
      unsigned nvectors = mask.second;

      if (masks->rgc_vec.length () < nvectors)
	masks->rgc_vec.safe_grow_cleared (nvectors, true);
      rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* Keep the widest requirement seen for this rgroup.  */
      if (rgm->max_nscalars_per_iter < nscalars_per_iter)
	{
	  rgm->max_nscalars_per_iter = nscalars_per_iter;
	  rgm->type = truth_type_for (vectype);
	  rgm->factor = 1;
	}
    }

  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width
    = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
				      UNSIGNED);

  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (this_type
	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
	    {
	      /* Although we could stop as soon as we find a valid mode,
		 there are at least two reasons why that's not always the
		 best choice:

		 - An IV that's Pmode or wider is more likely to be reusable
		   in address calculations than an IV that's narrower than
		   Pmode.

		 - Doing the comparison in IV_PRECISION or wider allows
		   a natural 0-based IV, whereas using a narrower comparison
		   type requires mitigations against wrap-around.

		 Conversely, if the IV limit is variable, doing the comparison
		 in a wider type than the original type can introduce
		 unnecessary extensions, so picking the widest valid mode
		 is not always a good choice either.

		 Here we prefer the first IV type that's Pmode or wider,
		 and the first comparison type that's IV_PRECISION or wider.
		 (The comparison type must be no wider than the IV type,
		 to avoid extensions in the vector loop.)

		 ??? We might want to try continuing beyond Pmode for ILP32
		 targets if CMP_BITS < IV_PRECISION.  */
	      iv_type = this_type;
	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
		cmp_type = this_type;
	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
		break;
	    }
	}
    }

  if (!cmp_type)
    {
      /* No usable comparison type: drop the rgroups built above.  */
      LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
  return true;
}
1105 :
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate AVX512 style masks.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */

static bool
vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
{
  /* Produce differently organized rgc_vec and differently check
     we can produce masks.  */

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* For the decrementing IV we need to represent all values in
     [0, niter + niter_skip] where niter_skip is the elements we
     skip in the first iteration for prologue peeling.  */
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;
  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit, UNSIGNED);

  /* First compute the type for the IV we use to track the remaining
     scalar iterations.  */
  opt_scalar_int_mode cmp_mode_iter;
  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= iv_precision
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  iv_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (iv_type)
	    break;
	}
    }
  if (!iv_type)
    return false;

  /* Produce the rgroup controls.  Unlike the WHILE_ULT scheme, rgc_vec
     is indexed by nscalars_per_iter - 1 here (see comment below).  */
  for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    {
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      tree vectype = mask.first;
      unsigned nvectors = mask.second;

      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* We index the rgroup_controls vector with nscalars_per_iter
	 which we keep constant and instead have a varying nvectors,
	 remembering the vector mask with the fewest nV.  */
      if (masks->rgc_vec.length () < nscalars_per_iter)
	masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
      rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];

      if (!rgm->type || rgm->factor > nvectors)
	{
	  rgm->type = truth_type_for (vectype);
	  rgm->compare_type = NULL_TREE;
	  rgm->max_nscalars_per_iter = nscalars_per_iter;
	  rgm->factor = nvectors;
	  rgm->bias_adjusted_ctrl = NULL_TREE;
	}
    }

  /* There is no fixed compare type we are going to use but we have to
     be able to get at one for each mask group.  */
  unsigned int min_ni_width
    = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);

  bool ok = true;
  for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
    {
      tree mask_type = rgc.type;
      if (!mask_type)
	continue;

      /* For now vect_get_loop_mask only supports integer mode masks
	 when we need to split it.  */
      if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
	  || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
	{
	  ok = false;
	  break;
	}

      /* If iv_type is usable as compare type use that - we can elide the
	 saturation in that case.   */
      if (TYPE_PRECISION (iv_type) >= min_ni_width)
	{
	  tree cmp_vectype
	    = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
	  if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
	    rgc.compare_type = cmp_vectype;
	}
      /* Otherwise search for the narrowest integer mode wide enough to
	 hold the iteration limit that can produce the mask via LT_EXPR.  */
      if (!rgc.compare_type)
	FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
	  {
	    unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
	    if (cmp_bits >= min_ni_width
		&& targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	      {
		tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
		if (!cmp_type)
		  continue;

		/* Check whether we can produce the mask with cmp_type.  */
		tree cmp_vectype
		  = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
		if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
		  {
		    rgc.compare_type = cmp_vectype;
		    break;
		  }
	      }
	  }
      if (!rgc.compare_type)
	{
	  ok = false;
	  break;
	}
    }
  if (!ok)
    {
      release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
      return false;
    }

  /* There is no single compare type in this scheme; mark it as such.  */
  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
  return true;
}
1246 :
/* Check whether we can use vector access with length based on precision
   comparison.  So far, to keep it simple, we only allow the case that the
   precision of the target supported length is larger than the precision
   required by loop niters.  */

static bool
vect_verify_loop_lens (loop_vec_info loop_vinfo)
{
  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    return false;

  if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
    return false;

  /* Both length-controlled loads and stores must be available.  */
  machine_mode len_load_mode, len_store_mode;
  if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
	 .exists (&len_load_mode))
    return false;
  if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
	 .exists (&len_store_mode))
    return false;

  signed char partial_load_bias = internal_len_load_store_bias
    (IFN_LEN_LOAD, len_load_mode);

  signed char partial_store_bias = internal_len_load_store_bias
    (IFN_LEN_STORE, len_store_mode);

  gcc_assert (partial_load_bias == partial_store_bias);

  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
    return false;

  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
     len_loads with a length of zero.  In order to avoid that we prohibit
     more than one loop length here.  */
  if (partial_load_bias == -1
      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
    return false;

  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;

  unsigned int max_nitems_per_iter = 1;
  unsigned int i;
  rgroup_controls *rgl;
  /* Find the maximum number of items per iteration for every rgroup.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
    {
      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
    }

  /* Work out how many bits we need to represent the length limit.  */
  unsigned int min_ni_prec
    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);

  /* Now use the maximum of below precisions for one suitable IV type:
     - the IV's natural precision
     - the precision needed to hold: the maximum number of scalar
       iterations multiplied by the scale factor (min_ni_prec above)
     - the Pmode precision

     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     wider IV to avoid narrow conversions.  */

  unsigned int ni_prec
    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
  min_ni_prec = MAX (min_ni_prec, ni_prec);
  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));

  tree iv_type = NULL_TREE;
  opt_scalar_int_mode tmode_iter;
  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
    {
      scalar_mode tmode = tmode_iter.require ();
      unsigned int tbits = GET_MODE_BITSIZE (tmode);

      /* ??? Do we really want to construct one IV whose precision exceeds
	 BITS_PER_WORD?  */
      if (tbits > BITS_PER_WORD)
	break;

      /* Find the first available standard integral type.  */
      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
	{
	  iv_type = build_nonstandard_integer_type (tbits, true);
	  break;
	}
    }

  if (!iv_type)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't vectorize with length-based partial vectors"
			 " because there is no suitable iv type.\n");
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;

  return true;
}
1353 :
/* Calculate the cost of one scalar iteration of the loop, recording
   per-statement costs in LOOP_VINFO_SCALAR_ITERATION_COST and the
   accumulated total in loop_vinfo->scalar_costs.  */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor;
  int innerloop_iters, i;

  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");

  /* Gather costs for statements in the scalar loop.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      /* Statements in the inner loop execute innerloop_iters times
	 per outer iteration; scale their cost accordingly.  */
      if (bb->loop_father == loop->inner)
	factor = innerloop_iters;
      else
	factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);

	  /* Only assignments, calls and conditions carry a cost here.  */
	  if (!is_gimple_assign (stmt)
	      && !is_gimple_call (stmt)
	      && !is_a<gcond *> (stmt))
	    continue;

	  /* Skip stmts that are not vectorized inside the loop.  */
	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
	  if (!STMT_VINFO_RELEVANT_P (vstmt_info)
	      && (!STMT_VINFO_LIVE_P (vstmt_info)
		  || !VECTORIZABLE_CYCLE_DEF
			(STMT_VINFO_DEF_TYPE (vstmt_info))))
	    continue;

	  /* Classify the statement as load, store or generic scalar op.  */
	  vect_cost_for_stmt kind;
	  if (STMT_VINFO_DATA_REF (stmt_info))
	    {
	      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
		kind = scalar_load;
	      else
		kind = scalar_store;
	    }
	  else if (vect_nop_conversion_p (stmt_info))
	    continue;
	  else
	    kind = scalar_stmt;

	  /* We are using vect_prologue here to avoid scaling twice
	     by the inner loop factor.  */
	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
			    factor, kind, stmt_info, 0, vect_prologue);
	}
    }

  /* Now accumulate cost.  */
  loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
  add_stmt_costs (loop_vinfo->scalar_costs,
		  &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
  loop_vinfo->scalar_costs->finish_cost (nullptr);
}
1426 :
1427 : /* Function vect_analyze_loop_form.
1428 :
1429 : Verify that certain CFG restrictions hold, including:
1430 : - the loop has a pre-header
1431 : - the loop has a single entry
1432 : - nested loops can have only a single exit.
1433 : - the loop exit condition is simple enough
1434 : - the number of iterations can be analyzed, i.e, a countable loop. The
1435 : niter could be analyzed under some assumptions. */
1436 :
1437 : opt_result
1438 454831 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1439 : vect_loop_form_info *info)
1440 : {
1441 454831 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1442 :
1443 454831 : edge exit_e = vec_init_loop_exit_info (loop);
1444 454831 : if (!exit_e)
1445 29474 : return opt_result::failure_at (vect_location,
1446 : "not vectorized:"
1447 : " Infinite loop detected.\n");
1448 425357 : if (loop_vectorized_call)
1449 : {
1450 28583 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1451 28583 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1452 28583 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1453 28583 : if (!scalar_exit_e)
1454 0 : return opt_result::failure_at (vect_location,
1455 : "not vectorized:"
1456 : " could not determine main exit from"
1457 : " loop with multiple exits.\n");
1458 : }
1459 :
1460 425357 : info->loop_exit = exit_e;
1461 425357 : if (dump_enabled_p ())
1462 16005 : dump_printf_loc (MSG_NOTE, vect_location,
1463 : "using as main loop exit: %d -> %d [AUX: %p]\n",
1464 16005 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1465 :
1466 : /* Check if we have any control flow that doesn't leave the loop. */
1467 425357 : basic_block *bbs = get_loop_body (loop);
1468 1392573 : for (unsigned i = 0; i < loop->num_nodes; i++)
1469 1082983 : if (EDGE_COUNT (bbs[i]->succs) != 1
1470 1082983 : && (EDGE_COUNT (bbs[i]->succs) != 2
1471 649264 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1472 : {
1473 115767 : free (bbs);
1474 115767 : return opt_result::failure_at (vect_location,
1475 : "not vectorized:"
1476 : " unsupported control flow in loop.\n");
1477 : }
1478 :
1479 : /* Check if we have any control flow that doesn't leave the loop. */
1480 310685 : bool has_phi = false;
1481 310685 : for (unsigned i = 0; i < loop->num_nodes; i++)
1482 310228 : if (!gimple_seq_empty_p (phi_nodes (bbs[i])))
1483 : {
1484 : has_phi = true;
1485 : break;
1486 : }
1487 309590 : if (!has_phi)
1488 457 : return opt_result::failure_at (vect_location,
1489 : "not vectorized:"
1490 : " no scalar evolution detected in loop.\n");
1491 :
1492 309133 : free (bbs);
1493 :
1494 : /* Different restrictions apply when we are considering an inner-most loop,
1495 : vs. an outer (nested) loop.
1496 : (FORNOW. May want to relax some of these restrictions in the future). */
1497 :
1498 309133 : info->inner_loop_cond = NULL;
1499 309133 : if (!loop->inner)
1500 : {
1501 : /* Inner-most loop. */
1502 :
1503 290661 : if (empty_block_p (loop->header))
1504 0 : return opt_result::failure_at (vect_location,
1505 : "not vectorized: empty loop.\n");
1506 : }
1507 : else
1508 : {
1509 18472 : class loop *innerloop = loop->inner;
1510 18472 : edge entryedge;
1511 :
1512 : /* Nested loop. We currently require that the loop is doubly-nested,
1513 : contains a single inner loop with a single exit to the block
1514 : with the single exit condition in the outer loop.
1515 : Vectorizable outer-loops look like this:
1516 :
1517 : (pre-header)
1518 : |
1519 : header <---+
1520 : | |
1521 : inner-loop |
1522 : | |
1523 : tail ------+
1524 : |
1525 : (exit-bb)
1526 :
1527 : The inner-loop also has the properties expected of inner-most loops
1528 : as described above. */
1529 :
1530 18472 : if ((loop->inner)->inner || (loop->inner)->next)
1531 2935 : return opt_result::failure_at (vect_location,
1532 : "not vectorized:"
1533 : " multiple nested loops.\n");
1534 :
1535 15537 : entryedge = loop_preheader_edge (innerloop);
1536 15537 : if (entryedge->src != loop->header
1537 15039 : || !single_exit (innerloop)
1538 26946 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1539 4470 : return opt_result::failure_at (vect_location,
1540 : "not vectorized:"
1541 : " unsupported outerloop form.\n");
1542 :
1543 : /* Analyze the inner-loop. */
1544 11067 : vect_loop_form_info inner;
1545 11067 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1546 11067 : if (!res)
1547 : {
1548 416 : if (dump_enabled_p ())
1549 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1550 : "not vectorized: Bad inner loop.\n");
1551 416 : return res;
1552 : }
1553 :
1554 : /* Don't support analyzing niter under assumptions for inner
1555 : loop. */
1556 10651 : if (!integer_onep (inner.assumptions))
1557 257 : return opt_result::failure_at (vect_location,
1558 : "not vectorized: Bad inner loop.\n");
1559 :
1560 10394 : if (inner.number_of_iterations == chrec_dont_know
1561 10394 : || !expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1562 1837 : return opt_result::failure_at (vect_location,
1563 : "not vectorized: inner-loop count not"
1564 : " invariant.\n");
1565 :
1566 8557 : if (dump_enabled_p ())
1567 1049 : dump_printf_loc (MSG_NOTE, vect_location,
1568 : "Considering outer-loop vectorization.\n");
1569 8557 : info->inner_loop_cond = inner.conds[0];
1570 11067 : }
1571 :
1572 299218 : if (EDGE_COUNT (loop->header->preds) != 2)
1573 0 : return opt_result::failure_at (vect_location,
1574 : "not vectorized:"
1575 : " too many incoming edges.\n");
1576 :
1577 : /* We assume that the latch is empty. */
1578 299218 : basic_block latch = loop->latch;
1579 299218 : do
1580 : {
1581 299218 : if (!empty_block_p (latch)
1582 299218 : || !gimple_seq_empty_p (phi_nodes (latch)))
1583 20714 : return opt_result::failure_at (vect_location,
1584 : "not vectorized: latch block not "
1585 : "empty.\n");
1586 278504 : latch = single_pred (latch);
1587 : }
1588 557008 : while (single_succ_p (latch));
1589 :
1590 : /* Make sure there is no abnormal exit. */
1591 278504 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1592 1235254 : for (edge e : exits)
1593 : {
1594 399775 : if (e->flags & EDGE_ABNORMAL)
1595 33 : return opt_result::failure_at (vect_location,
1596 : "not vectorized:"
1597 : " abnormal loop exit edge.\n");
1598 : }
1599 :
1600 278471 : info->conds
1601 278471 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1602 : &info->number_of_iterations,
1603 278471 : &info->number_of_iterationsm1);
1604 278471 : if (info->conds.is_empty ())
1605 30 : return opt_result::failure_at
1606 30 : (vect_location,
1607 : "not vectorized: complicated exit condition.\n");
1608 :
1609 : /* Determine what the primary and alternate exit conds are. */
1610 678153 : for (unsigned i = 0; i < info->conds.length (); i++)
1611 : {
1612 399712 : gcond *cond = info->conds[i];
1613 399712 : if (exit_e->src == gimple_bb (cond))
1614 278441 : std::swap (info->conds[0], info->conds[i]);
1615 : }
1616 :
1617 278441 : if (chrec_contains_undetermined (info->number_of_iterations))
1618 : {
1619 58767 : if (dump_enabled_p ())
1620 257 : dump_printf_loc (MSG_NOTE, vect_location,
1621 : "Loop being analyzed as uncounted.\n");
1622 58767 : if (loop->inner)
1623 562 : return opt_result::failure_at
1624 562 : (vect_location,
1625 : "not vectorized: outer loop vectorization of uncounted loops"
1626 : " is unsupported.\n");
1627 58205 : return opt_result::success ();
1628 : }
1629 :
1630 219674 : if (integer_zerop (info->assumptions))
1631 4 : return opt_result::failure_at
1632 4 : (info->conds[0],
1633 : "not vectorized: number of iterations cannot be computed.\n");
1634 :
1635 219670 : if (integer_zerop (info->number_of_iterations))
1636 12 : return opt_result::failure_at
1637 12 : (info->conds[0],
1638 : "not vectorized: number of iterations = 0.\n");
1639 :
1640 219658 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1641 120842 : && tree_to_shwi (info->number_of_iterations) > 0))
1642 : {
1643 98816 : if (dump_enabled_p ())
1644 : {
1645 2468 : dump_printf_loc (MSG_NOTE, vect_location,
1646 : "Symbolic number of iterations is ");
1647 2468 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1648 2468 : dump_printf (MSG_NOTE, "\n");
1649 : }
1650 : }
1651 :
1652 219658 : if (!integer_onep (info->assumptions))
1653 : {
1654 8487 : if (dump_enabled_p ())
1655 : {
1656 66 : dump_printf_loc (MSG_NOTE, vect_location,
1657 : "Loop to be versioned with niter assumption ");
1658 66 : dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
1659 66 : dump_printf (MSG_NOTE, "\n");
1660 : }
1661 : }
1662 :
1663 219658 : return opt_result::success ();
1664 278504 : }
1665 :
1666 : /* Create a loop_vec_info for LOOP with SHARED and the
1667 : vect_analyze_loop_form result. */
1668 :
1669 : loop_vec_info
1670 493412 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1671 : const vect_loop_form_info *info,
1672 : loop_vec_info orig_loop_info)
1673 : {
1674 493412 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1675 493412 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1676 493412 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1677 493412 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1678 493412 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
 : /* If ORIG_LOOP_INFO is itself an epilogue, inherit its recorded
 : main loop; otherwise ORIG_LOOP_INFO (possibly NULL) is the
 : main loop. */
1679 493412 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1680 171 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1681 171 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1682 : else
1683 493241 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1684 : /* Also record the assumptions for versioning. */
1685 493412 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1686 18799 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1687 :
1688 2221726 : for (gcond *cond : info->conds)
1689 : {
1690 741490 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1691 : /* Mark the statement as a condition. */
1692 741490 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1693 : }
1694 :
 : /* For counted loops the first recorded condition is the loop IV
 : condition; any remaining conditions are alternate (early-break)
 : exit conditions. */
1695 493412 : unsigned cond_id = 0;
1696 493412 : if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
1697 408974 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[cond_id++];
1698 :
1699 825928 : for (; cond_id < info->conds.length (); cond_id ++)
1700 332516 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[cond_id]);
1701 :
1702 493412 : LOOP_VINFO_MAIN_EXIT (loop_vinfo) = info->loop_exit;
1703 :
1704 : /* Check to see if we're vectorizing multiple exits. */
1705 493412 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1706 493412 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1707 :
1708 493412 : if (info->inner_loop_cond)
1709 : {
1710 : /* If we have an estimate on the number of iterations of the inner
1711 : loop use that to limit the scale for costing, otherwise use
1712 : --param vect-inner-loop-cost-factor literally. */
1713 8672 : widest_int nit;
1714 8672 : if (estimated_stmt_executions (loop->inner, &nit))
1715 7391 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1716 7391 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1717 8672 : }
1718 :
1719 493412 : return loop_vinfo;
1720 : }
1721 :
1722 :
1723 :
1724 : /* Return true if we know that the iteration count is smaller than the
1725 : vectorization factor. Return false if it isn't, or if we can't be sure
1726 : either way. */
1727 :
1728 : static bool
1729 112095 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1730 : {
1731 112095 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1732 :
 : /* Use the exact iteration count when it is known at compile time,
 : otherwise fall back to the niter analysis upper bound (which is
 : -1 when no bound is available). */
1733 112095 : HOST_WIDE_INT max_niter;
1734 112095 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1735 53743 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1736 : else
1737 58352 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1738 :
 : /* max_niter == -1 means no usable bound, in which case we cannot
 : be sure either way and must return false. */
1739 112095 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1740 10789 : return true;
1741 :
1742 : return false;
1743 : }
1744 :
1745 : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1746 : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1747 : definitely no, or -1 if it's worth retrying. */
1748 :
1749 : static int
1750 112103 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1751 : unsigned *suggested_unroll_factor)
1752 : {
1753 112103 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1754 112103 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1755 :
1756 : /* Only loops that can handle partially-populated vectors can have iteration
1757 : counts less than the vectorization factor. */
1758 112103 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
1759 112103 : && vect_known_niters_smaller_than_vf (loop_vinfo))
1760 : {
1761 10779 : if (dump_enabled_p ())
1762 236 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1763 : "not vectorized: iteration count smaller than "
1764 : "vectorization factor.\n");
1765 10779 : return 0;
1766 : }
1767 :
1768 : /* If we know the number of iterations we can do better, for the
1769 : epilogue we can also decide whether the main loop leaves us
1770 : with enough iterations, preferring a smaller vector epilogue that is
1771 : also possibly used for the case we skip the vector loop. */
1772 101324 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1773 : {
1774 44192 : widest_int scalar_niters
1775 44192 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
1776 44192 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1777 : {
1778 2692 : loop_vec_info orig_loop_vinfo
1779 : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1780 2692 : loop_vec_info main_loop_vinfo
1781 : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
1782 2692 : unsigned lowest_vf
1783 2692 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
1784 2692 : int prolog_peeling = 0;
1785 2692 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
1786 2692 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
 : /* When the main loop's VF is constant and the prologue peeling
 : amount is known, the epilogue's scalar iteration count is
 : exactly the main loop's leftover modulo that VF, plus the
 : gap iteration if one is peeled. */
1787 2692 : if (prolog_peeling >= 0
1788 2692 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
1789 : lowest_vf))
1790 : {
1791 5374 : unsigned gap
1792 2687 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
1793 5374 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
1794 5374 : % lowest_vf + gap);
1795 : }
1796 : }
1797 : /* Reject vectorizing for a single scalar iteration, even if
1798 : we could in principle implement that using partial vectors.
1799 : But allow such vectorization if VF == 1 in case we do not
1800 : need to peel for gaps (if we need, avoid vectorization for
1801 : reasons of code footprint). */
1802 44192 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1803 44192 : if (scalar_niters <= peeling_gap + 1
1804 44192 : && (assumed_vf > 1 || peeling_gap != 0))
1805 : {
1806 690 : if (dump_enabled_p ())
1807 159 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1808 : "not vectorized: loop only has a single "
1809 : "scalar iteration.\n");
1810 690 : return 0;
1811 : }
1812 :
1813 43502 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1814 : {
1815 : /* Check that the loop processes at least one full vector. */
1816 43491 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1817 43491 : if (known_lt (scalar_niters, vf))
1818 : {
1819 364 : if (dump_enabled_p ())
1820 296 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1821 : "loop does not have enough iterations "
1822 : "to support vectorization.\n");
1823 404 : return 0;
1824 : }
1825 :
1826 : /* If we need to peel an extra epilogue iteration to handle data
1827 : accesses with gaps, check that there are enough scalar iterations
1828 : available.
1829 :
1830 : The check above is redundant with this one when peeling for gaps,
1831 : but the distinction is useful for diagnostics. */
1832 43127 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1833 43426 : && known_le (scalar_niters, vf))
1834 : {
1835 40 : if (dump_enabled_p ())
1836 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 : "loop does not have enough iterations "
1838 : "to support peeling for gaps.\n");
1839 40 : return 0;
1840 : }
1841 : }
1842 44192 : }
1843 :
1844 : /* If using the "very cheap" model. reject cases in which we'd keep
1845 : a copy of the scalar code (even if we might be able to vectorize it). */
1846 100230 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1847 100230 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1848 49443 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
1849 : {
1850 721 : if (dump_enabled_p ())
1851 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1852 : "some scalar iterations would need to be peeled\n");
1853 721 : return 0;
1854 : }
1855 :
1856 99509 : int min_profitable_iters, min_profitable_estimate;
1857 99509 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1858 : &min_profitable_estimate,
1859 : suggested_unroll_factor);
1860 :
 : /* A negative minimum means the vector version can never be
 : profitable for any iteration count. */
1861 99509 : if (min_profitable_iters < 0)
1862 : {
1863 24191 : if (dump_enabled_p ())
1864 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1865 : "not vectorized: vectorization not profitable.\n");
1866 24191 : if (dump_enabled_p ())
1867 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868 : "not vectorized: vector version will never be "
1869 : "profitable.\n");
1870 24191 : return -1;
1871 : }
1872 :
1873 75318 : int min_scalar_loop_bound = (param_min_vect_loop_bound
1874 75318 : * assumed_vf);
1875 :
1876 : /* Use the cost model only if it is more conservative than user specified
1877 : threshold. */
1878 75318 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1879 : min_profitable_iters);
1880 :
1881 75318 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1882 :
1883 38098 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1884 113416 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1885 : {
1886 384 : if (dump_enabled_p ())
1887 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 : "not vectorized: vectorization not profitable.\n");
1889 384 : if (dump_enabled_p ())
1890 1 : dump_printf_loc (MSG_NOTE, vect_location,
1891 : "not vectorized: iteration count smaller than user "
1892 : "specified loop bound parameter or minimum profitable "
1893 : "iterations (whichever is more conservative).\n");
1894 384 : return 0;
1895 : }
1896 :
1897 : /* The static profitability threshold min_profitable_estimate includes
1898 : the cost of having to check at runtime whether the scalar loop
1899 : should be used instead. If it turns out that we don't need or want
1900 : such a check, the threshold we should use for the static estimate
1901 : is simply the point at which the vector loop becomes more profitable
1902 : than the scalar loop. */
1903 74934 : if (min_profitable_estimate > min_profitable_iters
1904 16145 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1905 15628 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1906 287 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1907 75221 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1908 : {
1909 11 : if (dump_enabled_p ())
1910 6 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1911 : " choice between the scalar and vector loops\n");
1912 11 : min_profitable_estimate = min_profitable_iters;
1913 : }
1914 :
1915 : /* If the vector loop needs multiple iterations to be beneficial then
1916 : things are probably too close to call, and the conservative thing
1917 : would be to stick with the scalar code. */
1918 74934 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1919 74934 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1920 : {
1921 8562 : if (dump_enabled_p ())
1922 177 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1923 : "one iteration of the vector loop would be"
1924 : " more expensive than the equivalent number of"
1925 : " iterations of the scalar loop\n");
1926 8562 : return 0;
1927 : }
1928 :
1929 66372 : HOST_WIDE_INT estimated_niter;
1930 :
1931 : /* If we are vectorizing an epilogue then we know the maximum number of
1932 : scalar iterations it will cover is at least one lower than the
1933 : vectorization factor of the main loop. */
1934 66372 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1935 11038 : estimated_niter
1936 11038 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1937 : else
1938 : {
1939 55334 : estimated_niter = estimated_stmt_executions_int (loop);
1940 55334 : if (estimated_niter == -1)
1941 21106 : estimated_niter = likely_max_stmt_executions_int (loop);
1942 : }
1943 32144 : if (estimated_niter != -1
1944 64545 : && ((unsigned HOST_WIDE_INT) estimated_niter
1945 64545 : < MAX (th, (unsigned) min_profitable_estimate)))
1946 : {
1947 4471 : if (dump_enabled_p ())
1948 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1949 : "not vectorized: estimated iteration count too "
1950 : "small.\n");
1951 4471 : if (dump_enabled_p ())
1952 28 : dump_printf_loc (MSG_NOTE, vect_location,
1953 : "not vectorized: estimated iteration count smaller "
1954 : "than specified loop bound parameter or minimum "
1955 : "profitable iterations (whichever is more "
1956 : "conservative).\n");
1957 4471 : return -1;
1958 : }
1959 :
1960 : /* As we cannot use a runtime check to gate profitability for uncounted
1961 : loops require either an estimate or if none, at least a profitable
1962 : vectorization within the first vector iteration (that condition
1963 : will practically never be true due to the required epilog and
1964 : likely alignment prologue). */
1965 61901 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)
1966 155 : && estimated_niter == -1
1967 62029 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1968 : {
1969 112 : if (dump_enabled_p ())
1970 2 : dump_printf_loc (MSG_NOTE, vect_location,
1971 : "not vectorized: no loop iteration estimate on the "
1972 : "uncounted loop and not trivially profitable.\n");
1973 112 : return -1;
1974 : }
1975 :
1976 : return 1;
1977 : }
1978 :
1979 : /* Gather data references in LOOP with body BBS and store them into
1980 : *DATAREFS. */
1981 :
1982 : static opt_result
1983 275714 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1984 : vec<data_reference_p> *datarefs)
1985 : {
1986 821436 : for (unsigned i = 0; i < loop->num_nodes; i++)
1987 1217768 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1988 5147041 : !gsi_end_p (gsi); gsi_next (&gsi))
1989 : {
1990 4601319 : gimple *stmt = gsi_stmt (gsi);
 : /* Debug stmts carry no data references. */
1991 4601319 : if (is_gimple_debug (stmt))
1992 2119002 : continue;
1993 2482447 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1994 : NULL, 0);
1995 2482447 : if (!res)
1996 : {
 : /* In a safelen-annotated loop a call may still be OK if it
 : can be handled via simd clones and the call itself does
 : not touch memory; check before giving up. */
1997 63292 : if (is_gimple_call (stmt) && loop->safelen)
1998 : {
1999 404 : tree fndecl = gimple_call_fndecl (stmt), op;
 : /* For IFN_MASK_CALL the real callee is recorded as the
 : first argument (an ADDR_EXPR of the FUNCTION_DECL). */
2000 404 : if (fndecl == NULL_TREE
2001 404 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2002 : {
2003 0 : fndecl = gimple_call_arg (stmt, 0);
2004 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2005 0 : fndecl = TREE_OPERAND (fndecl, 0);
2006 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2007 : }
2008 404 : if (fndecl != NULL_TREE)
2009 : {
2010 367 : cgraph_node *node = cgraph_node::get (fndecl);
2011 367 : if (node != NULL && node->simd_clones != NULL)
2012 : {
2013 131 : unsigned int j, n = gimple_call_num_args (stmt);
2014 545 : for (j = 0; j < n; j++)
2015 : {
2016 284 : op = gimple_call_arg (stmt, j);
2017 284 : if (DECL_P (op)
2018 284 : || (REFERENCE_CLASS_P (op)
2019 0 : && get_base_address (op)))
2020 : break;
2021 : }
2022 131 : op = gimple_call_lhs (stmt);
2023 : /* Ignore #pragma omp declare simd functions
2024 : if they don't have data references in the
2025 : call stmt itself. */
2026 261 : if (j == n
2027 131 : && !(op
2028 120 : && (DECL_P (op)
2029 120 : || (REFERENCE_CLASS_P (op)
2030 0 : && get_base_address (op)))))
2031 130 : continue;
2032 : }
2033 : }
2034 : }
2035 63162 : return res;
2036 : }
2037 : /* If dependence analysis will give up due to the limit on the
2038 : number of datarefs stop here and fail fatally. */
2039 4244841 : if (datarefs->length ()
2040 1825686 : > (unsigned)param_loop_max_datarefs_for_datadeps)
2041 0 : return opt_result::failure_at (stmt, "exceeded param "
2042 : "loop-max-datarefs-for-datadeps\n");
2043 : }
2044 212552 : return opt_result::success ();
2045 : }
2046 :
2047 : /* Determine if operating on full vectors for LOOP_VINFO might leave
2048 : some scalar iterations still to do. If so, decide how we should
2049 : handle those scalar iterations. The possibilities are:
2050 :
2051 : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2052 : In this case:
2053 :
2054 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2055 : LOOP_VINFO_PEELING_FOR_NITER == false
2056 :
2057 : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2058 : to handle the remaining scalar iterations. In this case:
2059 :
2060 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2061 : LOOP_VINFO_PEELING_FOR_NITER == true
2062 :
2063 : The MASKED_P argument specifies to what extent
2064 : param_vect_partial_vector_usage is to be honored. For MASKED_P == 0
2065 : no partial vectors are to be used, for MASKED_P == -1 it's
2066 : param_vect_partial_vector_usage that gets to decide whether we may
2067 : consider partial vector usage. For MASKED_P == 1 partial vectors
2068 : may be used if possible.
2069 :
2070 : */
2071 :
2072 : static opt_result
2073 112904 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2074 : int masked_p)
2075 : {
2076 : /* Determine whether there would be any scalar iterations left over. */
2077 112904 : bool need_peeling_or_partial_vectors_p
2078 112904 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2079 :
2080 : /* Decide whether to vectorize the loop with partial vectors. */
2081 112904 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2082 112904 : if (masked_p == 0
2083 112904 : || (masked_p == -1 && param_vect_partial_vector_usage == 0))
2084 : /* If requested explicitly do not use partial vectors. */
2085 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2086 121 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2087 42 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2088 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2089 121 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2090 42 : && need_peeling_or_partial_vectors_p)
2091 : {
2092 : /* For partial-vector-usage=1, try to push the handling of partial
2093 : vectors to the epilogue, with the main loop continuing to operate
2094 : on full vectors.
2095 :
2096 : If we are unrolling we also do not want to use partial vectors. This
2097 : is to avoid the overhead of generating multiple masks and also to
2098 : avoid having to execute entire iterations of FALSE masked instructions
2099 : when dealing with one or less full iterations.
2100 :
2101 : ??? We could then end up failing to use partial vectors if we
2102 : decide to peel iterations into a prologue, and if the main loop
2103 : then ends up processing fewer than VF iterations. */
2104 34 : if ((param_vect_partial_vector_usage == 1
2105 8 : || loop_vinfo->suggested_unroll_factor > 1)
2106 26 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2107 52 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2108 : ;
2109 : else
2110 26 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2111 : }
2112 :
 : /* A loop that must use partial vectors but ends up not using them
 : cannot be vectorized at all. */
2113 112904 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2114 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2115 0 : return opt_result::failure_at (vect_location,
2116 : "not vectorized: loop needs but cannot "
2117 : "use partial vectors\n");
2118 :
2119 112904 : if (dump_enabled_p ())
2120 12010 : dump_printf_loc (MSG_NOTE, vect_location,
2121 : "operating on %s vectors%s.\n",
2122 12010 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2123 : ? "partial" : "full",
2124 12010 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2125 : ? " for epilogue loop" : "");
2126 :
 : /* Without partial vectors any leftover scalar iterations have to
 : be handled by peeling an epilogue. */
2127 112904 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2128 225808 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2129 112904 : && need_peeling_or_partial_vectors_p);
2130 :
2131 112904 : return opt_result::success ();
2132 : }
2133 :
2134 : /* Function vect_analyze_loop_2.
2135 :
2136 : Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2137 : analyses will record information in some members of LOOP_VINFO. FATAL
2138 : indicates if some analysis meets fatal error. If one non-NULL pointer
2139 : SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2140 : worked out suggested unroll factor, while one NULL pointer shows it's
2141 : going to apply the suggested unroll factor.
2142 : SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF is to hold whether single-lane
2143 : slp was forced when the suggested unroll factor was worked out. */
2144 : static opt_result
2145 492712 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, int masked_p, bool &fatal,
2146 : unsigned *suggested_unroll_factor,
2147 : bool& single_lane_slp_done_for_suggested_uf)
2148 : {
2149 492712 : opt_result ok = opt_result::success ();
2150 492712 : int res;
2151 492712 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2152 492712 : loop_vec_info orig_loop_vinfo = NULL;
2153 :
2154 : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2155 : loop_vec_info of the first vectorized loop. */
2156 492712 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2157 18098 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2158 : else
2159 : orig_loop_vinfo = loop_vinfo;
2160 18098 : gcc_assert (orig_loop_vinfo);
2161 :
2162 : /* We can't mask on niters for uncounted loops due to unknown upper bound. */
2163 492712 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
2164 84438 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2165 :
2166 : /* The first group of checks is independent of the vector size. */
2167 492712 : fatal = true;
2168 :
2169 492712 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2170 492712 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2171 5 : return opt_result::failure_at (vect_location,
2172 : "not vectorized: simd if(0)\n");
2173 :
2174 : /* Find all data references in the loop (which correspond to vdefs/vuses)
2175 : and analyze their evolution in the loop. */
2176 :
2177 492707 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2178 :
2179 : /* Gather the data references. */
2180 492707 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2181 : {
2182 275714 : opt_result res
2183 275714 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2184 : &LOOP_VINFO_DATAREFS (loop_vinfo));
2185 275714 : if (!res)
2186 : {
2187 63162 : if (dump_enabled_p ())
2188 1642 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2189 : "not vectorized: loop contains function "
2190 : "calls or data references that cannot "
2191 : "be analyzed\n");
2192 63162 : return res;
2193 : }
2194 212552 : loop_vinfo->shared->save_datarefs ();
2195 : }
2196 : else
2197 216993 : loop_vinfo->shared->check_datarefs ();
2198 :
2199 : /* Analyze the data references and also adjust the minimal
2200 : vectorization factor according to the loads and stores. */
2201 :
2202 429545 : ok = vect_analyze_data_refs (loop_vinfo, &fatal);
2203 429545 : if (!ok)
2204 : {
2205 57980 : if (dump_enabled_p ())
2206 1033 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2207 : "bad data references.\n");
2208 57980 : return ok;
2209 : }
2210 :
2211 : /* Check if we are applying unroll factor now. */
2212 371565 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2213 371565 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2214 :
2215 : /* When single-lane SLP was forced and we are applying suggested unroll
2216 : factor, keep that decision here. */
2217 743130 : bool force_single_lane = (applying_suggested_uf
2218 371565 : && single_lane_slp_done_for_suggested_uf);
2219 :
2220 : /* Classify all cross-iteration scalar data-flow cycles.
2221 : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2222 371565 : vect_analyze_scalar_cycles (loop_vinfo);
2223 :
2224 371565 : vect_pattern_recog (loop_vinfo);
2225 :
2226 : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2227 : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2228 :
2229 371565 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2230 371565 : if (!ok)
2231 : {
2232 7901 : if (dump_enabled_p ())
2233 291 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2234 : "bad data access.\n");
2235 7901 : return ok;
2236 : }
2237 :
2238 : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2239 :
2240 363664 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2241 363664 : if (!ok)
2242 : {
2243 45040 : if (dump_enabled_p ())
2244 401 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2245 : "unexpected pattern.\n");
2246 45040 : return ok;
2247 : }
2248 :
2249 : /* While the rest of the analysis below depends on it in some way. */
2250 318624 : fatal = false;
2251 :
2252 : /* Analyze data dependences between the data-refs in the loop
2253 : and adjust the maximum vectorization factor according to
2254 : the dependences.
2255 : FORNOW: fail at the first data dependence that we encounter. */
2256 :
2257 318624 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2258 318624 : if (!ok)
2259 : {
2260 23322 : if (dump_enabled_p ())
2261 538 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 : "bad data dependence.\n");
2263 23322 : return ok;
2264 : }
2265 295302 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2266 :
2267 : /* Compute the scalar iteration cost. */
2268 295302 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2269 :
2270 295302 : bool saved_can_use_partial_vectors_p
2271 : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2272 :
2273 : /* This is the point where we can re-start analysis with single-lane
2274 : SLP forced. */
2275 423170 : start_over:
2276 :
2277 : /* Check the SLP opportunities in the loop, analyze and build
2278 : SLP trees. */
2279 846340 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2280 : force_single_lane);
2281 423170 : if (!ok)
2282 20711 : return ok;
2283 :
2284 : /* If there are any SLP instances mark them as pure_slp and compute
2285 : the overall vectorization factor. */
2286 402459 : if (!vect_make_slp_decision (loop_vinfo))
2287 46630 : return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
2288 :
2289 355829 : if (dump_enabled_p ())
2290 18425 : dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
2291 :
2292 : /* Dump the vectorization factor from the SLP decision. */
2293 355829 : if (dump_enabled_p ())
2294 : {
2295 18425 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
2296 18425 : dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2297 18425 : dump_printf (MSG_NOTE, "\n");
2298 : }
2299 :
2300 : /* We don't expect to have to roll back to anything other than an empty
2301 : set of rgroups. */
2302 355829 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2303 :
2304 : /* Apply the suggested unrolling factor, this was determined by the backend
2305 : during finish_cost the first time we ran the analysis for this
2306 : vector mode. */
2307 355829 : if (applying_suggested_uf)
2308 247 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2309 :
2310 : /* Now the vectorization factor is final. */
2311 355829 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2312 355829 : gcc_assert (known_ne (vectorization_factor, 0U));
2313 :
2314 : /* Optimize the SLP graph with the vectorization factor fixed. */
2315 355829 : vect_optimize_slp (loop_vinfo);
2316 :
2317 : /* Gather the loads reachable from the SLP graph entries. */
2318 355829 : vect_gather_slp_loads (loop_vinfo);
2319 :
2320 355829 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2321 : {
2322 13820 : dump_printf_loc (MSG_NOTE, vect_location,
2323 : "vectorization_factor = ");
2324 13820 : dump_dec (MSG_NOTE, vectorization_factor);
2325 13820 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2326 13820 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2327 : }
2328 :
2329 355829 : if (max_vf != MAX_VECTORIZATION_FACTOR
2330 355829 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2331 41 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2332 :
2333 355788 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2334 :
2335 : /* Analyze the alignment of the data-refs in the loop. */
2336 355788 : vect_analyze_data_refs_alignment (loop_vinfo);
2337 :
2338 : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2339 : It is important to call pruning after vect_analyze_data_ref_accesses,
2340 : since we use grouping information gathered by interleaving analysis. */
2341 355788 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2342 355788 : if (!ok)
2343 16641 : return ok;
2344 :
2345 : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2346 : vectorization, since we do not want to add extra peeling or
2347 : add versioning for alignment. */
2348 339147 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2349 : /* This pass will decide on using loop versioning and/or loop peeling in
2350 : order to enhance the alignment of data references in the loop. */
2351 324021 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2352 339147 : if (!ok)
2353 0 : return ok;
2354 :
2355 : /* Analyze operations in the SLP instances. We can't simply
2356 : remove unsupported SLP instances as this makes the above
2357 : SLP kind detection invalid and might also affect the VF. */
2358 339147 : if (! vect_slp_analyze_operations (loop_vinfo))
2359 : {
2360 226243 : ok = opt_result::failure_at (vect_location,
2361 : "unsupported SLP instances\n");
2362 226243 : goto again;
2363 : }
2364 :
2365 : /* For now, we don't expect to mix both masking and length approaches for one
2366 : loop, disable it if both are recorded. */
2367 112904 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2368 16761 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2369 129659 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2370 : {
2371 0 : if (dump_enabled_p ())
2372 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2373 : "can't vectorize a loop with partial vectors"
2374 : " because we don't expect to mix different"
2375 : " approaches with partial vectors for the"
2376 : " same loop.\n");
2377 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2378 : }
2379 :
2380 : /* If we still have the option of using partial vectors,
2381 : check whether we can generate the necessary loop controls. */
2382 112904 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2383 : {
2384 16761 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2385 : {
2386 16755 : if (!vect_verify_full_masking (loop_vinfo)
2387 16755 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2388 3655 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2389 : }
2390 : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2391 6 : if (!vect_verify_loop_lens (loop_vinfo))
2392 6 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2393 : }
2394 :
2395 : /* Decide whether this loop_vinfo should use partial vectors or peeling,
2396 : assuming that the loop will be used as a main loop. We will redo
2397 : this analysis later if we instead decide to use the loop as an
2398 : epilogue loop. */
2399 112904 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, masked_p);
2400 112904 : if (!ok)
2401 0 : return ok;
2402 :
2403 : /* If we're vectorizing a loop that uses length "controls" and
2404 : can iterate more than once, we apply decrementing IV approach
2405 : in loop control. */
2406 112904 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2407 26 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2408 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2409 112904 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2410 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2411 : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2412 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2413 :
2414 : /* If a loop uses length controls and has a decrementing loop control IV,
2415 : we will normally pass that IV through a MIN_EXPR to calculate the
2416 : basis for the length controls. E.g. in a loop that processes one
2417 : element per scalar iteration, the number of elements would be
2418 : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2419 :
2420 : This MIN_EXPR approach allows us to use pointer IVs with an invariant
2421 : step, since only the final iteration of the vector loop can have
2422 : inactive lanes.
2423 :
2424 : However, some targets have a dedicated instruction for calculating the
2425 : preferred length, given the total number of elements that still need to
2426 : be processed. This is encapsulated in the SELECT_VL internal function.
2427 :
2428 : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2429 : to determine the basis for the length controls. However, unlike the
2430 : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2431 : lanes inactive in any iteration of the vector loop, not just the last
2432 : iteration. This SELECT_VL approach therefore requires us to use pointer
2433 : IVs with variable steps.
2434 :
2435 : Once we've decided how many elements should be processed by one
2436 : iteration of the vector loop, we need to populate the rgroup controls.
2437 : If a loop has multiple rgroups, we need to make sure that those rgroups
2438 : "line up" (that is, they must be consistent about which elements are
2439 : active and which aren't). This is done by vect_adjust_loop_lens_control.
2440 :
2441 : In principle, it would be possible to use vect_adjust_loop_lens_control
2442 : on either the result of a MIN_EXPR or the result of a SELECT_VL.
2443 : However:
2444 :
2445 : (1) In practice, it only makes sense to use SELECT_VL when a vector
2446 : operation will be controlled directly by the result. It is not
2447 : worth using SELECT_VL if it would only be the input to other
2448 : calculations.
2449 :
2450 : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2451 : pointer IV will need N updates by a variable amount (N-1 updates
2452 : within the iteration and 1 update to move to the next iteration).
2453 :
2454 : Because of this, we prefer to use the MIN_EXPR approach whenever there
2455 : is more than one length control.
2456 :
2457 : In addition, SELECT_VL always operates to a granularity of 1 unit.
2458 : If we wanted to use it to control an SLP operation on N consecutive
2459 : elements, we would need to make the SELECT_VL inputs measure scalar
2460 : iterations (rather than elements) and then multiply the SELECT_VL
2461 : result by N. But using SELECT_VL this way is inefficient because
2462 : of (1) above.
2463 :
2464 : 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2465 : satisfied:
2466 :
2467 : (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2468 : (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2469 :
2470 : Since SELECT_VL (variable step) will make SCEV analysis failed and then
2471 : we will fail to gain benefits of following unroll optimizations. We prefer
2472 : using the MIN_EXPR approach in this situation. */
2473 112904 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2474 : {
2475 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2476 0 : if (LOOP_VINFO_LENS (loop_vinfo).length () == 1
2477 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
2478 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2479 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2480 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2481 :
2482 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2483 0 : for (auto rgc : LOOP_VINFO_LENS (loop_vinfo))
2484 0 : if (rgc.type
2485 0 : && !direct_internal_fn_supported_p (IFN_SELECT_VL,
2486 : rgc.type, iv_type,
2487 : OPTIMIZE_FOR_SPEED))
2488 : {
2489 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2490 0 : break;
2491 : }
2492 :
2493 : /* If any of the SLP instances cover more than a single lane
2494 : we cannot use .SELECT_VL at the moment, even if the number
2495 : of lanes is uniform throughout the SLP graph. */
2496 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2497 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
2498 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
2499 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
2500 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
2501 : {
2502 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2503 0 : break;
2504 : }
2505 : }
2506 :
2507 : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2508 : to be able to handle fewer than VF scalars, or needs to have a lower VF
2509 : than the main loop. */
2510 112904 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2511 12671 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2512 : {
2513 12661 : poly_uint64 unscaled_vf
2514 12661 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2515 : orig_loop_vinfo->suggested_unroll_factor);
2516 12661 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
2517 285 : return opt_result::failure_at (vect_location,
2518 : "Vectorization factor too high for"
2519 : " epilogue loop.\n");
2520 : }
2521 :
2522 : /* If the epilogue needs peeling for gaps but the main loop doesn't,
2523 : give up on the epilogue. */
2524 112619 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2525 12386 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2526 67 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
2527 : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2528 4 : return opt_result::failure_at (vect_location,
2529 : "Epilogue loop requires peeling for gaps "
2530 : "but main loop does not.\n");
2531 :
2532 : /* If an epilogue loop is required make sure we can create one. */
2533 112615 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2534 111363 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2535 32677 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2536 : {
2537 80878 : if (dump_enabled_p ())
2538 5283 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2539 80878 : if (!vect_can_advance_ivs_p (loop_vinfo)
2540 161244 : || !slpeel_can_duplicate_loop_p (loop,
2541 : LOOP_VINFO_MAIN_EXIT (loop_vinfo),
2542 80366 : LOOP_VINFO_MAIN_EXIT (loop_vinfo)))
2543 : {
2544 512 : ok = opt_result::failure_at (vect_location,
2545 : "not vectorized: can't create required "
2546 : "epilog loop\n");
2547 512 : goto again;
2548 : }
2549 : }
2550 :
2551 : /* Check the costings of the loop make vectorizing worthwhile. */
2552 112103 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2553 112103 : if (res < 0 && !param_vect_allow_possibly_not_worthwhile_vectorizations)
2554 : {
2555 28774 : ok = opt_result::failure_at (vect_location,
2556 : "Loop costings may not be worthwhile.\n");
2557 28774 : goto again;
2558 : }
2559 83329 : if (!res)
2560 21540 : return opt_result::failure_at (vect_location,
2561 : "Loop costings not worthwhile.\n");
2562 :
2563 : /* During peeling, we need to check if number of loop iterations is
2564 : enough for both peeled prolog loop and vector loop. This check
2565 : can be merged along with threshold check of loop versioning, so
2566 : increase threshold for this case if necessary.
2567 :
2568 : If we are analyzing an epilogue we still want to check what its
2569 : versioning threshold would be. If we decide to vectorize the epilogues we
2570 : will want to use the lowest versioning threshold of all epilogues and main
2571 : loop. This will enable us to enter a vectorized epilogue even when
2572 : versioning the loop. We can't simply check whether the epilogue requires
2573 : versioning though since we may have skipped some versioning checks when
2574 : analyzing the epilogue. For instance, checks for alias versioning will be
2575 : skipped when dealing with epilogues as we assume we already checked them
2576 : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2577 61789 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2578 : {
2579 5827 : poly_uint64 niters_th = 0;
2580 5827 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2581 :
2582 5827 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2583 : {
2584 : /* Niters for peeled prolog loop. */
2585 5827 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2586 : {
2587 118 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2588 118 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2589 118 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2590 : }
2591 : else
2592 5709 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2593 : }
2594 :
2595 : /* Niters for at least one iteration of vectorized loop. */
2596 5827 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2597 5823 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2598 : /* One additional iteration because of peeling for gap. */
2599 5827 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2600 60 : niters_th += 1;
2601 :
2602 : /* Use the same condition as vect_transform_loop to decide when to use
2603 : the cost to determine a versioning threshold. */
2604 5827 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2605 5827 : && ordered_p (th, niters_th))
2606 3924 : niters_th = ordered_max (poly_uint64 (th), niters_th);
2607 :
2608 5827 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2609 : }
2610 :
2611 61789 : gcc_assert (known_eq (vectorization_factor,
2612 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2613 :
2614 61789 : single_lane_slp_done_for_suggested_uf = force_single_lane;
2615 :
2616 : /* Ok to vectorize! */
2617 61789 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2618 61789 : return opt_result::success ();
2619 :
2620 255529 : again:
2621 : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2622 255529 : gcc_assert (!ok);
2623 :
2624 : /* Try again with single-lane SLP. */
2625 255529 : if (force_single_lane)
2626 126721 : return ok;
2627 :
2628 : /* If we are applying suggested unroll factor, we don't need to
2629 : re-try any more as we want to keep the SLP mode fixed. */
2630 128808 : if (applying_suggested_uf)
2631 6 : return ok;
2632 :
2633 : /* Likewise if the grouped loads or stores in the SLP cannot be handled
2634 : via interleaving or lane instructions. */
2635 : slp_instance instance;
2636 : slp_tree node;
2637 : unsigned i, j;
2638 350361 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2639 : {
2640 222493 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
2641 0 : continue;
2642 :
2643 222493 : stmt_vec_info vinfo;
2644 222493 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2645 222493 : if (!vinfo || !STMT_VINFO_GROUPED_ACCESS (vinfo))
2646 219962 : continue;
2647 2531 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2648 2531 : unsigned int size = DR_GROUP_SIZE (vinfo);
2649 2531 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
2650 2531 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
2651 4382 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2652 5057 : && ! vect_grouped_store_supported (vectype, size))
2653 675 : return opt_result::failure_at (vinfo->stmt,
2654 : "unsupported grouped store\n");
2655 224694 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2656 : {
2657 1943 : vinfo = SLP_TREE_REPRESENTATIVE (node);
2658 1943 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
2659 : {
2660 1681 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2661 1681 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2662 1681 : size = DR_GROUP_SIZE (vinfo);
2663 1681 : vectype = SLP_TREE_VECTYPE (node);
2664 1681 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
2665 1681 : && ! vect_grouped_load_supported (vectype, single_element_p,
2666 : size))
2667 259 : return opt_result::failure_at (vinfo->stmt,
2668 : "unsupported grouped load\n");
2669 : }
2670 : }
2671 : }
2672 :
2673 : /* Roll back state appropriately. Force single-lane SLP this time. */
2674 127868 : force_single_lane = true;
2675 127868 : if (dump_enabled_p ())
2676 3297 : dump_printf_loc (MSG_NOTE, vect_location,
2677 : "re-trying with single-lane SLP\n");
2678 :
2679 : /* Reset the vectorization factor. */
2680 127868 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
2681 : /* Free the SLP instances. */
2682 349420 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2683 221552 : vect_free_slp_instance (instance);
2684 127868 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2685 : /* Reset altered state on stmts. */
2686 489884 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2687 : {
2688 362016 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2689 362016 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
2690 641651 : !gsi_end_p (si); gsi_next (&si))
2691 : {
2692 279635 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2693 279635 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2694 279635 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2695 : {
2696 : /* vectorizable_reduction adjusts reduction stmt def-types,
2697 : restore them to that of the PHI. */
2698 20572 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2699 20572 : = STMT_VINFO_DEF_TYPE (stmt_info);
2700 20572 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2701 : (STMT_VINFO_REDUC_DEF (stmt_info)))
2702 20572 : = STMT_VINFO_DEF_TYPE (stmt_info);
2703 : }
2704 : }
2705 724032 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
2706 2212489 : !gsi_end_p (si); gsi_next (&si))
2707 : {
2708 1850473 : if (is_gimple_debug (gsi_stmt (si)))
2709 707893 : continue;
2710 1142580 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2711 1142580 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2712 : {
2713 218071 : stmt_vec_info pattern_stmt_info
2714 : = STMT_VINFO_RELATED_STMT (stmt_info);
2715 218071 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2716 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2717 : }
2718 : }
2719 : }
2720 : /* Free optimized alias test DDRS. */
2721 127868 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2722 127868 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2723 127868 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2724 : /* Reset target cost data. */
2725 127868 : delete loop_vinfo->vector_costs;
2726 127868 : loop_vinfo->vector_costs = nullptr;
2727 : /* Reset accumulated rgroup information. */
2728 127868 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
2729 127868 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
2730 127868 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2731 : /* Reset assorted flags. */
2732 127868 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2733 127868 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2734 127868 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2735 127868 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2736 127868 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2737 127868 : = saved_can_use_partial_vectors_p;
2738 127868 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2739 127868 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2740 127868 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2741 127868 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = false;
2742 :
2743 127868 : if (loop_vinfo->scan_map)
2744 122 : loop_vinfo->scan_map->empty ();
2745 :
2746 127868 : goto start_over;
2747 : }
2748 :
2749 : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2750 : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2751 : OLD_LOOP_VINFO is better unless something specifically indicates
2752 : otherwise.
2753 :
2754 : Note that this deliberately isn't a partial order. */
2755 :
2756 : static bool
2757 5 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2758 : loop_vec_info old_loop_vinfo)
2759 : {
2760 5 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2761 5 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2762 :
2763 5 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2764 5 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2765 :
2766 : /* Always prefer a VF of loop->simdlen over any other VF. */
2767 5 : if (loop->simdlen)
2768 : {
2769 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2770 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2771 0 : if (new_simdlen_p != old_simdlen_p)
2772 : return new_simdlen_p;
2773 : }
2774 :
2775 5 : const auto *old_costs = old_loop_vinfo->vector_costs;
2776 5 : const auto *new_costs = new_loop_vinfo->vector_costs;
2777 5 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2778 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2779 :
2780 5 : return new_costs->better_main_loop_than_p (old_costs);
2781 : }
2782 :
2783 : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2784 : true if we should. */
2785 :
2786 : static bool
2787 5 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2788 : loop_vec_info old_loop_vinfo)
2789 : {
2790 5 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2791 : return false;
2792 :
2793 1 : if (dump_enabled_p ())
2794 1 : dump_printf_loc (MSG_NOTE, vect_location,
2795 : "***** Preferring vector mode %s to vector mode %s\n",
2796 1 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
2797 1 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
2798 : return true;
2799 : }
2800 :
2801 : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
2802 : not NULL. When MASKED_P is not -1 override the default
2803 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it.
2804 : Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance MODE_I to the next
2805 : mode useful to analyze.
2806 : Return the loop_vinfo on success and wrapped null on failure. */
2807 :
2808 : static opt_loop_vec_info
2809 492465 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2810 : const vect_loop_form_info *loop_form_info,
2811 : loop_vec_info orig_loop_vinfo,
2812 : const vector_modes &vector_modes, unsigned &mode_i,
2813 : int masked_p,
2814 : machine_mode &autodetected_vector_mode,
2815 : bool &fatal)
2816 : {
2817 492465 : loop_vec_info loop_vinfo
2818 492465 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
2819 :
2820 492465 : machine_mode vector_mode = vector_modes[mode_i];
2821 492465 : loop_vinfo->vector_mode = vector_mode;
2822 492465 : unsigned int suggested_unroll_factor = 1;
 : /* Set by vect_analyze_loop_2 to record whether its analysis was done
 : with single-lane SLP forced, so the unroll re-try below analyzes in
 : the same SLP mode. */
2823 492465 : bool single_lane_slp_done_for_suggested_uf = false;
2824 :
2825 : /* Run the main analysis. */
2826 492465 : opt_result res = vect_analyze_loop_2 (loop_vinfo, masked_p, fatal,
2827 : &suggested_unroll_factor,
2828 : single_lane_slp_done_for_suggested_uf);
2829 492465 : if (dump_enabled_p ())
2830 20387 : dump_printf_loc (MSG_NOTE, vect_location,
2831 : "***** Analysis %s with vector mode %s\n",
2832 20387 : res ? "succeeded" : "failed",
2833 20387 : GET_MODE_NAME (loop_vinfo->vector_mode));
2834 :
 : /* loop->unroll carries a user-requested unroll factor (NOTE(review):
 : presumably from a pragma/attribute on the loop -- confirm at the
 : setter). Unrolling is only considered for successfully analyzed
 : main loops, never for epilogues. */
2835 492465 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
2836 492465 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2837 : /* Check to see if the user wants to unroll or if the target wants to. */
2838 546986 : && (suggested_unroll_factor > 1 || user_unroll > 1))
2839 : {
2840 261 : if (suggested_unroll_factor == 1)
2841 : {
2842 44 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
2843 44 : suggested_unroll_factor = user_unroll / assumed_vf;
2844 44 : if (suggested_unroll_factor > 1)
2845 : {
2846 30 : if (dump_enabled_p ())
2847 20 : dump_printf_loc (MSG_NOTE, vect_location,
2848 : "setting unroll factor to %d based on user requested "
2849 : "unroll factor %d and suggested vectorization "
2850 : "factor: %d\n",
2851 : suggested_unroll_factor, user_unroll, assumed_vf);
2852 : }
2853 : }
2854 :
2855 261 : if (suggested_unroll_factor > 1)
2856 : {
2857 247 : if (dump_enabled_p ())
2858 44 : dump_printf_loc (MSG_NOTE, vect_location,
2859 : "***** Re-trying analysis for unrolling"
2860 : " with unroll factor %d and %s slp.\n",
2861 : suggested_unroll_factor,
2862 : single_lane_slp_done_for_suggested_uf
2863 : ? "single-lane" : "");
 : /* Re-run the full analysis on a fresh loop_vinfo with the unroll
 : factor applied; keep whichever analysis succeeded and delete
 : the other. */
2864 247 : loop_vec_info unroll_vinfo
2865 247 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
2866 247 : unroll_vinfo->vector_mode = vector_mode;
2867 247 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2868 247 : opt_result new_res
2869 247 : = vect_analyze_loop_2 (unroll_vinfo, masked_p, fatal, NULL,
2870 : single_lane_slp_done_for_suggested_uf);
2871 247 : if (new_res)
2872 : {
2873 201 : delete loop_vinfo;
2874 201 : loop_vinfo = unroll_vinfo;
2875 : }
2876 : else
2877 46 : delete unroll_vinfo;
2878 : }
2879 :
2880 : /* Record that we have honored a user unroll factor. */
2881 261 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
2882 : }
2883 :
2884 : /* Remember the autodetected vector mode. */
2885 492465 : if (vector_mode == VOIDmode)
2886 266512 : autodetected_vector_mode = loop_vinfo->vector_mode;
2887 :
2888 : /* Advance mode_i, first skipping modes that would result in the
2889 : same analysis result. */
2890 2288449 : while (mode_i + 1 < vector_modes.length ()
2891 1616875 : && vect_chooses_same_modes_p (loop_vinfo,
2892 718883 : vector_modes[mode_i + 1]))
2893 : {
2894 405527 : if (dump_enabled_p ())
2895 17028 : dump_printf_loc (MSG_NOTE, vect_location,
2896 : "***** The result for vector mode %s would"
2897 : " be the same\n",
2898 17028 : GET_MODE_NAME (vector_modes[mode_i + 1]))
2899 405527 : mode_i += 1;
2900 : }
2901 492465 : if (mode_i + 1 < vector_modes.length ()
2902 805821 : && vect_chooses_same_modes_p (autodetected_vector_mode,
2903 313356 : vector_modes[mode_i + 1]))
2904 : {
2905 349 : if (dump_enabled_p ())
2906 10 : dump_printf_loc (MSG_NOTE, vect_location,
2907 : "***** Skipping vector mode %s, which would"
2908 : " repeat the analysis for %s\n",
2909 10 : GET_MODE_NAME (vector_modes[mode_i + 1]),
2910 10 : GET_MODE_NAME (autodetected_vector_mode));
2911 349 : mode_i += 1;
2912 : }
 : /* Leave MODE_I pointing at the next mode for the caller to try. */
2913 492465 : mode_i++;
2914 :
 : /* On failure discard the partially-analyzed loop_vinfo. A fatal
 : failure is only expected when analyzing a main loop, i.e. with no
 : ORIG_LOOP_VINFO (checked below). */
2915 492465 : if (!res)
2916 : {
2917 430877 : delete loop_vinfo;
2918 430877 : if (fatal)
2919 103778 : gcc_checking_assert (orig_loop_vinfo == NULL);
2920 430877 : return opt_loop_vec_info::propagate_failure (res);
2921 : }
2922 :
2923 61588 : return opt_loop_vec_info::success (loop_vinfo);
2924 : }
2925 :
2926 : /* Function vect_analyze_loop.
2927 :
2928 : Apply a set of analyses on LOOP, and create a loop_vec_info struct
2929 : for it. The different analyses will record information in the
2930 : loop_vec_info struct. */
2931 : opt_loop_vec_info
2932 465309 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
2933 : vec_info_shared *shared)
2934 : {
2935 465309 : DUMP_VECT_SCOPE ("analyze_loop_nest");
2936 :
2937 465309 : if (loop_outer (loop)
2938 465309 : && loop_vec_info_for_loop (loop_outer (loop))
2939 465867 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2940 558 : return opt_loop_vec_info::failure_at (vect_location,
2941 : "outer-loop already vectorized.\n");
2942 :
2943 464751 : if (!find_loop_nest (loop, &shared->loop_nest))
2944 22264 : return opt_loop_vec_info::failure_at
2945 22264 : (vect_location,
2946 : "not vectorized: loop nest containing two or more consecutive inner"
2947 : " loops cannot be vectorized\n");
2948 :
2949 : /* Analyze the loop form. */
2950 442487 : vect_loop_form_info loop_form_info;
2951 442487 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
2952 : &loop_form_info);
2953 442487 : if (!res)
2954 : {
2955 175975 : if (dump_enabled_p ())
2956 1532 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2957 : "bad loop form.\n");
2958 175975 : return opt_loop_vec_info::propagate_failure (res);
2959 : }
2960 266512 : if (!integer_onep (loop_form_info.assumptions))
2961 : {
2962 : /* We consider to vectorize this loop by versioning it under
2963 : some assumptions. In order to do this, we need to clear
2964 : existing information computed by scev and niter analyzer. */
2965 8230 : scev_reset_htab ();
2966 8230 : free_numbers_of_iterations_estimates (loop);
2967 : /* Also set flag for this loop so that following scev and niter
2968 : analysis are done under the assumptions. */
2969 8230 : loop_constraint_set (loop, LOOP_C_FINITE);
2970 : }
2971 : else
2972 : /* Clear the existing niter information to make sure the nonwrapping flag
2973 : will be calculated and set propriately. */
2974 258282 : free_numbers_of_iterations_estimates (loop);
2975 :
2976 266512 : auto_vector_modes vector_modes;
2977 : /* Autodetect first vector size we try. */
2978 266512 : vector_modes.safe_push (VOIDmode);
2979 266512 : unsigned int autovec_flags
2980 533024 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2981 266512 : loop->simdlen != 0);
2982 266512 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2983 266512 : && !unlimited_cost_model (loop));
2984 266512 : machine_mode autodetected_vector_mode = VOIDmode;
2985 266512 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2986 266512 : unsigned int mode_i = 0;
2987 266512 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2988 :
2989 : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
2990 : a mode has not been analyzed. */
2991 266512 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
2992 2676934 : for (unsigned i = 0; i < vector_modes.length (); ++i)
2993 1071955 : cached_vf_per_mode.safe_push (0);
2994 :
2995 : /* First determine the main loop vectorization mode, either the first
2996 : one that works, starting with auto-detecting the vector mode and then
2997 : following the targets order of preference, or the one with the
2998 : lowest cost if pick_lowest_cost_p. */
2999 682222 : while (1)
3000 : {
3001 474367 : bool fatal;
3002 474367 : unsigned int last_mode_i = mode_i;
3003 : /* Set cached VF to -1 prior to analysis, which indicates a mode has
3004 : failed. */
3005 474367 : cached_vf_per_mode[last_mode_i] = -1;
3006 474367 : opt_loop_vec_info loop_vinfo
3007 474367 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3008 : NULL, vector_modes, mode_i, -1,
3009 : autodetected_vector_mode, fatal);
3010 474367 : if (fatal)
3011 : break;
3012 :
3013 370589 : if (loop_vinfo)
3014 : {
3015 : /* Analysis has been successful so update the VF value. The
3016 : VF should always be a multiple of unroll_factor and we want to
3017 : capture the original VF here. */
3018 54521 : cached_vf_per_mode[last_mode_i]
3019 54521 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3020 54521 : loop_vinfo->suggested_unroll_factor);
3021 : /* Once we hit the desired simdlen for the first time,
3022 : discard any previous attempts. */
3023 54521 : if (simdlen
3024 54521 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3025 : {
3026 47 : delete first_loop_vinfo;
3027 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3028 : simdlen = 0;
3029 : }
3030 54474 : else if (pick_lowest_cost_p
3031 10 : && first_loop_vinfo
3032 54479 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3033 : {
3034 : /* Pick loop_vinfo over first_loop_vinfo. */
3035 1 : delete first_loop_vinfo;
3036 1 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3037 : }
3038 54521 : if (first_loop_vinfo == NULL)
3039 : first_loop_vinfo = loop_vinfo;
3040 : else
3041 : {
3042 6 : delete loop_vinfo;
3043 6 : loop_vinfo = opt_loop_vec_info::success (NULL);
3044 : }
3045 :
3046 : /* Commit to first_loop_vinfo if we have no reason to try
3047 : alternatives. */
3048 54521 : if (!simdlen && !pick_lowest_cost_p)
3049 : break;
3050 : }
3051 316087 : if (mode_i == vector_modes.length ()
3052 316087 : || autodetected_vector_mode == VOIDmode)
3053 : break;
3054 :
3055 : /* Try the next biggest vector size. */
3056 207855 : if (dump_enabled_p ())
3057 3967 : dump_printf_loc (MSG_NOTE, vect_location,
3058 : "***** Re-trying analysis with vector mode %s\n",
3059 3967 : GET_MODE_NAME (vector_modes[mode_i]));
3060 207855 : }
3061 266512 : if (!first_loop_vinfo)
3062 212003 : return opt_loop_vec_info::propagate_failure (res);
3063 :
3064 54509 : if (dump_enabled_p ())
3065 9506 : dump_printf_loc (MSG_NOTE, vect_location,
3066 : "***** Choosing vector mode %s\n",
3067 9506 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3068 :
3069 : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3070 : enabled, SIMDUID is not set, it is the innermost loop and we have
3071 : either already found the loop's SIMDLEN or there was no SIMDLEN to
3072 : begin with.
3073 : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3074 54509 : bool vect_epilogues = (!simdlen
3075 54507 : && loop->inner == NULL
3076 53935 : && param_vect_epilogues_nomask
3077 52862 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3078 : /* No code motion support for multiple epilogues so for now
3079 : not supported when multiple exits. */
3080 25984 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3081 25512 : && !loop->simduid
3082 78608 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3083 54509 : if (!vect_epilogues)
3084 41576 : return first_loop_vinfo;
3085 :
3086 : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3087 :
3088 : /* For epilogues start the analysis from the first mode. The motivation
3089 : behind starting from the beginning comes from cases where the VECTOR_MODES
3090 : array may contain length-agnostic and length-specific modes. Their
3091 : ordering is not guaranteed, so we could end up picking a mode for the main
3092 : loop that is after the epilogue's optimal mode. */
3093 12933 : int masked_p = -1;
3094 12933 : if (!unlimited_cost_model (loop)
3095 12933 : && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3096 : != VOIDmode))
3097 : {
3098 4 : vector_modes[0]
3099 4 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3100 4 : cached_vf_per_mode[0] = 0;
3101 : }
3102 : else
3103 12929 : vector_modes[0] = autodetected_vector_mode;
3104 12933 : mode_i = 0;
3105 :
3106 12969 : bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
3107 12933 : || masked_p == 1);
3108 : if (supports_partial_vectors
3109 36 : && !partial_vectors_supported_p ()
3110 36 : && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo))
3111 : supports_partial_vectors = false;
3112 12933 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3113 :
3114 12933 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3115 13121 : do
3116 : {
3117 : /* Let the user override what the target suggests. */
3118 13027 : if (OPTION_SET_P (param_vect_partial_vector_usage))
3119 43 : masked_p = -1;
3120 :
3121 44622 : while (1)
3122 : {
3123 : /* If the target does not support partial vectors we can shorten the
3124 : number of modes to analyze for the epilogue as we know we can't
3125 : pick a mode that would lead to a VF at least as big as the
3126 : FIRST_VINFO_VF. */
3127 58426 : if (!supports_partial_vectors
3128 44622 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3129 : {
3130 13833 : mode_i++;
3131 27666 : if (mode_i == vector_modes.length ())
3132 : break;
3133 26495 : continue;
3134 : }
3135 : /* We would need an exhaustive search to find all modes we
3136 : skipped but that would lead to the same result as the
3137 : analysis it was skipped for and where we'd could check
3138 : cached_vf_per_mode against.
3139 : Check for the autodetected mode, which is the common
3140 : situation on x86 which does not perform cost comparison. */
3141 43480 : if (!supports_partial_vectors
3142 30779 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3143 61029 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3144 30240 : vector_modes[mode_i]))
3145 : {
3146 12691 : mode_i++;
3147 25382 : if (mode_i == vector_modes.length ())
3148 : break;
3149 12691 : continue;
3150 : }
3151 :
3152 18098 : if (dump_enabled_p ())
3153 3231 : dump_printf_loc (MSG_NOTE, vect_location,
3154 : "***** Re-trying epilogue analysis with vector "
3155 3231 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3156 :
3157 18098 : bool fatal;
3158 18098 : opt_loop_vec_info loop_vinfo
3159 18098 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3160 : orig_loop_vinfo,
3161 : vector_modes, mode_i, masked_p,
3162 : autodetected_vector_mode, fatal);
3163 18098 : if (fatal)
3164 : break;
3165 :
3166 18098 : if (loop_vinfo)
3167 : {
3168 7067 : if (pick_lowest_cost_p
3169 4 : && orig_loop_vinfo->epilogue_vinfo
3170 7067 : && vect_joust_loop_vinfos (loop_vinfo,
3171 0 : orig_loop_vinfo->epilogue_vinfo))
3172 : {
3173 0 : gcc_assert (vect_epilogues);
3174 0 : delete orig_loop_vinfo->epilogue_vinfo;
3175 0 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3176 : }
3177 7067 : if (!orig_loop_vinfo->epilogue_vinfo)
3178 7067 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3179 : else
3180 : {
3181 0 : delete loop_vinfo;
3182 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3183 : }
3184 :
3185 : /* For now only allow one epilogue loop, but allow
3186 : pick_lowest_cost_p to replace it, so commit to the
3187 : first epilogue if we have no reason to try alternatives. */
3188 7067 : if (!pick_lowest_cost_p)
3189 : break;
3190 : }
3191 :
3192 : /* Revert back to the default from the suggested prefered
3193 : epilogue vectorization mode. */
3194 11035 : masked_p = -1;
3195 22070 : if (mode_i == vector_modes.length ())
3196 : break;
3197 : }
3198 :
3199 13027 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3200 13027 : if (!orig_loop_vinfo)
3201 : break;
3202 :
3203 : /* When we selected a first vectorized epilogue, see if the target
3204 : suggests to have another one. */
3205 7067 : masked_p = -1;
3206 7067 : if (!unlimited_cost_model (loop)
3207 4129 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3208 11189 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3209 : != VOIDmode))
3210 : {
3211 188 : vector_modes[0]
3212 94 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3213 94 : cached_vf_per_mode[0] = 0;
3214 94 : mode_i = 0;
3215 : }
3216 : else
3217 : break;
3218 94 : }
3219 : while (1);
3220 :
3221 12933 : if (first_loop_vinfo->epilogue_vinfo)
3222 : {
3223 6978 : poly_uint64 lowest_th
3224 6978 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3225 6978 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3226 7067 : do
3227 : {
3228 7067 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3229 7067 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3230 : || maybe_ne (lowest_th, 0U));
3231 : /* Keep track of the known smallest versioning threshold. */
3232 7067 : if (ordered_p (lowest_th, th))
3233 7067 : lowest_th = ordered_min (lowest_th, th);
3234 7067 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3235 : }
3236 7067 : while (epilog_vinfo);
3237 6978 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3238 6978 : if (dump_enabled_p ())
3239 1443 : dump_printf_loc (MSG_NOTE, vect_location,
3240 : "***** Choosing epilogue vector mode %s\n",
3241 1443 : GET_MODE_NAME
3242 : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3243 : }
3244 :
3245 12933 : return first_loop_vinfo;
3246 708999 : }
3247 :
3248 : /* Return true if there is an in-order reduction function for CODE, storing
3249 : it in *REDUC_FN if so. */
3250 :
3251 : static bool
3252 4714 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3253 : {
3254 : /* We support MINUS_EXPR by negating the operand. This also preserves an
3255 : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3256 : (-0.0) = -0.0. */
3257 4714 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3258 : {
3259 4038 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3260 0 : return true;
3261 : }
3262 : return false;
3263 : }
3264 :
3265 : /* Function reduction_fn_for_scalar_code
3266 :
3267 : Input:
3268 : CODE - tree_code of a reduction operations.
3269 :
3270 : Output:
3271 : REDUC_FN - the corresponding internal function to be used to reduce the
3272 : vector of partial results into a single scalar result, or IFN_LAST
3273 : if the operation is a supported reduction operation, but does not have
3274 : such an internal function.
3275 :
3276 : Return FALSE if CODE currently cannot be vectorized as reduction. */
3277 :
3278 : bool
3279 1964269 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3280 : {
3281 1964269 : if (code.is_tree_code ())
3282 1964211 : switch (tree_code (code))
3283 : {
3284 15079 : case MAX_EXPR:
3285 15079 : *reduc_fn = IFN_REDUC_MAX;
3286 15079 : return true;
3287 :
3288 49746 : case MIN_EXPR:
3289 49746 : *reduc_fn = IFN_REDUC_MIN;
3290 49746 : return true;
3291 :
3292 1062940 : case PLUS_EXPR:
3293 1062940 : *reduc_fn = IFN_REDUC_PLUS;
3294 1062940 : return true;
3295 :
3296 233593 : case BIT_AND_EXPR:
3297 233593 : *reduc_fn = IFN_REDUC_AND;
3298 233593 : return true;
3299 :
3300 280413 : case BIT_IOR_EXPR:
3301 280413 : *reduc_fn = IFN_REDUC_IOR;
3302 280413 : return true;
3303 :
3304 42929 : case BIT_XOR_EXPR:
3305 42929 : *reduc_fn = IFN_REDUC_XOR;
3306 42929 : return true;
3307 :
3308 279511 : case MULT_EXPR:
3309 279511 : case MINUS_EXPR:
3310 279511 : *reduc_fn = IFN_LAST;
3311 279511 : return true;
3312 :
3313 : default:
3314 : return false;
3315 : }
3316 : else
3317 58 : switch (combined_fn (code))
3318 : {
3319 34 : CASE_CFN_FMAX:
3320 34 : *reduc_fn = IFN_REDUC_FMAX;
3321 34 : return true;
3322 :
3323 24 : CASE_CFN_FMIN:
3324 24 : *reduc_fn = IFN_REDUC_FMIN;
3325 24 : return true;
3326 :
3327 : default:
3328 : return false;
3329 : }
3330 : }
3331 :
3332 : /* Set *SBOOL_FN to the corresponding function working on vector masks
3333 : for REDUC_FN. Return true if that exists, false otherwise. */
3334 :
3335 : static bool
3336 0 : sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
3337 : {
3338 0 : switch (reduc_fn)
3339 : {
3340 0 : case IFN_REDUC_AND:
3341 0 : *sbool_fn = IFN_REDUC_SBOOL_AND;
3342 0 : return true;
3343 0 : case IFN_REDUC_IOR:
3344 0 : *sbool_fn = IFN_REDUC_SBOOL_IOR;
3345 0 : return true;
3346 0 : case IFN_REDUC_XOR:
3347 0 : *sbool_fn = IFN_REDUC_SBOOL_XOR;
3348 0 : return true;
3349 : default:
3350 : return false;
3351 : }
3352 : }
3353 :
3354 : /* If there is a neutral value X such that a reduction would not be affected
3355 : by the introduction of additional X elements, return that X, otherwise
3356 : return null. CODE is the code of the reduction and SCALAR_TYPE is type
3357 : of the scalar elements. If the reduction has just a single initial value
3358 : then INITIAL_VALUE is that value, otherwise it is null.
3359 : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3360 : In that case no signed zero is returned. */
3361 :
3362 : tree
3363 52246 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3364 : tree initial_value, bool as_initial)
3365 : {
3366 52246 : if (code.is_tree_code ())
3367 52188 : switch (tree_code (code))
3368 : {
3369 7792 : case DOT_PROD_EXPR:
3370 7792 : case SAD_EXPR:
3371 7792 : case MINUS_EXPR:
3372 7792 : case BIT_IOR_EXPR:
3373 7792 : case BIT_XOR_EXPR:
3374 7792 : return build_zero_cst (scalar_type);
3375 39415 : case WIDEN_SUM_EXPR:
3376 39415 : case PLUS_EXPR:
3377 39415 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3378 92 : return build_real (scalar_type, dconstm0);
3379 : else
3380 39323 : return build_zero_cst (scalar_type);
3381 :
3382 2048 : case MULT_EXPR:
3383 2048 : return build_one_cst (scalar_type);
3384 :
3385 934 : case BIT_AND_EXPR:
3386 934 : return build_all_ones_cst (scalar_type);
3387 :
3388 : case MAX_EXPR:
3389 : case MIN_EXPR:
3390 : return initial_value;
3391 :
3392 384 : default:
3393 384 : return NULL_TREE;
3394 : }
3395 : else
3396 58 : switch (combined_fn (code))
3397 : {
3398 : CASE_CFN_FMIN:
3399 : CASE_CFN_FMAX:
3400 : return initial_value;
3401 :
3402 0 : default:
3403 0 : return NULL_TREE;
3404 : }
3405 : }
3406 :
3407 : /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3408 : STMT is printed with a message MSG. */
3409 :
static void
report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
{
  /* Emit MSG immediately followed by a pretty-printed STMT at the
     current vectorizer dump location.  */
  dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
}
3415 :
3416 : /* Return true if we need an in-order reduction for operation CODE
3417 : on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3418 : overflow must wrap. */
3419 :
3420 : bool
3421 6368017 : needs_fold_left_reduction_p (tree type, code_helper code)
3422 : {
3423 : /* CHECKME: check for !flag_finite_math_only too? */
3424 6368017 : if (SCALAR_FLOAT_TYPE_P (type))
3425 : {
3426 546752 : if (code.is_tree_code ())
3427 546698 : switch (tree_code (code))
3428 : {
3429 : case MIN_EXPR:
3430 : case MAX_EXPR:
3431 : return false;
3432 :
3433 544999 : default:
3434 544999 : return !flag_associative_math;
3435 : }
3436 : else
3437 54 : switch (combined_fn (code))
3438 : {
3439 : CASE_CFN_FMIN:
3440 : CASE_CFN_FMAX:
3441 : return false;
3442 :
3443 2 : default:
3444 2 : return !flag_associative_math;
3445 : }
3446 : }
3447 :
3448 5821265 : if (INTEGRAL_TYPE_P (type))
3449 5820395 : return (!code.is_tree_code ()
3450 5820395 : || !operation_no_trapping_overflow (type, tree_code (code)));
3451 :
3452 870 : if (SAT_FIXED_POINT_TYPE_P (type))
3453 : return true;
3454 :
3455 : return false;
3456 : }
3457 :
3458 : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3459 : has a handled computation expression. Store the main reduction
3460 : operation in *CODE. */
3461 :
static bool
check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
		      tree loop_arg, code_helper *code,
		      vec<std::pair<ssa_op_iter, use_operand_p> > &path,
		      bool inner_loop_of_double_reduc)
{
  /* Iterative DFS over use-def chains: starting from the PHI use of
     LOOP_ARG, follow definitions until we reach the PHI result again.
     PATH records the chain of (iterator, use) pairs; VISITED prevents
     revisiting SSA names when backtracking.  */
  auto_bitmap visited;
  tree lookfor = PHI_RESULT (phi);
  ssa_op_iter curri;
  use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
  while (USE_FROM_PTR (curr) != loop_arg)
    curr = op_iter_next_use (&curri);
  /* Exhaust the PHI's iterator so backtracking stops at the start.  */
  curri.i = curri.numops;
  do
    {
      path.safe_push (std::make_pair (curri, curr));
      tree use = USE_FROM_PTR (curr);
      if (use == lookfor)
	break;
      gimple *def = SSA_NAME_DEF_STMT (use);
      if (gimple_nop_p (def)
	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
	{
	  /* Dead end (definition outside the loop): backtrack, resuming
	     the saved iterator of each popped frame.  */
pop:
	  do
	    {
	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
	      curri = x.first;
	      curr = x.second;
	      do
		curr = op_iter_next_use (&curri);
	      /* Skip already visited or non-SSA operands (from iterating
	         over PHI args).  */
	      while (curr != NULL_USE_OPERAND_P
		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
			 || ! bitmap_set_bit (visited,
					      SSA_NAME_VERSION
						(USE_FROM_PTR (curr)))));
	    }
	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
	  if (curr == NULL_USE_OPERAND_P)
	    break;
	}
      else
	{
	  /* Descend into DEF, visiting its first unvisited SSA operand.  */
	  if (gimple_code (def) == GIMPLE_PHI)
	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
	  else
	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
	  while (curr != NULL_USE_OPERAND_P
		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
		     || ! bitmap_set_bit (visited,
					  SSA_NAME_VERSION
					    (USE_FROM_PTR (curr)))))
	    curr = op_iter_next_use (&curri);
	  if (curr == NULL_USE_OPERAND_P)
	    goto pop;
	}
    }
  while (1);
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
      unsigned i;
      std::pair<ssa_op_iter, use_operand_p> *x;
      FOR_EACH_VEC_ELT (path, i, x)
	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
      dump_printf (MSG_NOTE, "\n");
    }

  /* Check whether the reduction path detected is valid.  */
  bool fail = path.length () == 0;
  /* NEG tracks whether the reduction value is negated each iteration
     (x = a - x style), which we cannot handle.  */
  bool neg = false;
  int sign = -1;
  *code = ERROR_MARK;
  for (unsigned i = 1; i < path.length (); ++i)
    {
      gimple *use_stmt = USE_STMT (path[i].second);
      gimple_match_op op;
      if (!gimple_extract_op (use_stmt, &op))
	{
	  fail = true;
	  break;
	}
      /* Compute OPI, the operand index of the reduction input within
	 USE_STMT.  */
      unsigned int opi = op.num_ops;
      if (gassign *assign = dyn_cast<gassign *> (use_stmt))
	{
	  /* The following make sure we can compute the operand index
	     easily plus it mostly disallows chaining via COND_EXPR condition
	     operands.  */
	  for (opi = 0; opi < op.num_ops; ++opi)
	    if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
	      break;
	}
      else if (gcall *call = dyn_cast<gcall *> (use_stmt))
	{
	  for (opi = 0; opi < op.num_ops; ++opi)
	    if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
	      break;
	}
      if (opi == op.num_ops)
	{
	  fail = true;
	  break;
	}
      /* Canonicalize subtraction to addition so all stmts on the path
	 compare equal below.  */
      op.code = canonicalize_code (op.code, op.type);
      if (op.code == MINUS_EXPR)
	{
	  op.code = PLUS_EXPR;
	  /* Track whether we negate the reduction value each iteration.  */
	  if (op.ops[1] == op.ops[opi])
	    neg = ! neg;
	}
      else if (op.code == IFN_COND_SUB)
	{
	  op.code = IFN_COND_ADD;
	  /* Track whether we negate the reduction value each iteration.  */
	  if (op.ops[2] == op.ops[opi])
	    neg = ! neg;
	}
      /* For an FMA the reduction code is the PLUS if the addition chain
	 is the reduction.  */
      else if (op.code == IFN_FMA && opi == 2)
	op.code = PLUS_EXPR;
      /* No-op conversions are transparent; otherwise every stmt on the
	 path must use the same operation (and, for min/max, the same
	 signedness).  */
      if (CONVERT_EXPR_CODE_P (op.code)
	  && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
	;
      else if (*code == ERROR_MARK)
	{
	  *code = op.code;
	  sign = TYPE_SIGN (op.type);
	}
      else if (op.code != *code)
	{
	  fail = true;
	  break;
	}
      else if ((op.code == MIN_EXPR
		|| op.code == MAX_EXPR)
	       && sign != TYPE_SIGN (op.type))
	{
	  fail = true;
	  break;
	}
      /* Check there's only a single stmt the op is used on.  For the
	 not value-changing tail and the last stmt allow out-of-loop uses,
	 but not when this is the inner loop of a double reduction.
	 ??? We could relax this and handle arbitrary live stmts by
	 forcing a scalar epilogue for example.  */
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *op_use_stmt;
      unsigned cnt = 0;
      bool cond_fn_p = op.code.is_internal_fn ()
	&& (conditional_internal_fn_code (internal_fn (op.code))
	    != ERROR_MARK);

      FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
	{
	  /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
	     have op1 twice (once as definition, once as else) in the same
	     operation.  Enforce this.  */
	  if (cond_fn_p && op_use_stmt == use_stmt)
	    {
	      gcall *call = as_a<gcall *> (use_stmt);
	      unsigned else_pos
		= internal_fn_else_index (internal_fn (op.code));
	      if (gimple_call_arg (call, else_pos) != op.ops[opi])
		{
		  fail = true;
		  break;
		}
	      for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
		{
		  if (j == else_pos)
		    continue;
		  if (gimple_call_arg (call, j) == op.ops[opi])
		    cnt++;
		}
	    }
	  else if (!is_gimple_debug (op_use_stmt)
		   && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
		       || flow_bb_inside_loop_p (loop,
						 gimple_bb (op_use_stmt))))
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      cnt++;
	}

      if (cnt != 1)
	{
	  fail = true;
	  break;
	}
    }
  return ! fail && ! neg && *code != ERROR_MARK;
}
3658 :
3659 : bool
3660 21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3661 : tree loop_arg, enum tree_code code)
3662 : {
3663 21 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3664 21 : code_helper code_;
3665 21 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
3666 21 : && code_ == code);
3667 21 : }
3668 :
3669 :
3670 :
3671 : /* Function vect_is_simple_reduction
3672 :
3673 : (1) Detect a cross-iteration def-use cycle that represents a simple
3674 : reduction computation. We look for the following pattern:
3675 :
3676 : loop_header:
3677 : a1 = phi < a0, a2 >
3678 : a3 = ...
3679 : a2 = operation (a3, a1)
3680 :
3681 : or
3682 :
3683 : a3 = ...
3684 : loop_header:
3685 : a1 = phi < a0, a2 >
3686 : a2 = operation (a3, a1)
3687 :
3688 : such that:
3689 : 1. operation is commutative and associative and it is safe to
3690 : change the order of the computation
3691 : 2. no uses for a2 in the loop (a2 is used out of the loop)
3692 : 3. no uses of a1 in the loop besides the reduction operation
3693 : 4. no uses of a1 outside the loop.
3694 :
3695 : Conditions 1,4 are tested here.
3696 : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3697 :
3698 : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3699 : nested cycles.
3700 :
3701 : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3702 : reductions:
3703 :
3704 : a1 = phi < a0, a2 >
3705 : inner loop (def of a3)
3706 : a2 = phi < a3 >
3707 :
3708 : (4) Detect condition expressions, ie:
3709 : for (int i = 0; i < N; i++)
3710 : if (a[i] < val)
3711 : ret_val = a[i];
3712 :
3713 : */
3714 :
3715 : static stmt_vec_info
3716 138584 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3717 : gphi **double_reduc)
3718 : {
3719 138584 : gphi *phi = as_a <gphi *> (phi_info->stmt);
3720 138584 : gimple *phi_use_stmt = NULL;
3721 138584 : imm_use_iterator imm_iter;
3722 138584 : use_operand_p use_p;
3723 :
3724 : /* When double_reduc is NULL we are testing the inner loop of a
3725 : double reduction. */
3726 138584 : bool inner_loop_of_double_reduc = double_reduc == NULL;
3727 138584 : if (double_reduc)
3728 137535 : *double_reduc = NULL;
3729 138584 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3730 :
3731 138584 : tree phi_name = PHI_RESULT (phi);
3732 : /* ??? If there are no uses of the PHI result the inner loop reduction
3733 : won't be detected as possibly double-reduction by vectorizable_reduction
3734 : because that tries to walk the PHI arg from the preheader edge which
3735 : can be constant. See PR60382. */
3736 138584 : if (has_zero_uses (phi_name))
3737 : return NULL;
3738 138457 : class loop *loop = (gimple_bb (phi))->loop_father;
3739 138457 : unsigned nphi_def_loop_uses = 0;
3740 529242 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3741 : {
3742 264064 : gimple *use_stmt = USE_STMT (use_p);
3743 264064 : if (is_gimple_debug (use_stmt))
3744 71416 : continue;
3745 :
3746 192648 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3747 : {
3748 11736 : if (dump_enabled_p ())
3749 40 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3750 : "intermediate value used outside loop.\n");
3751 :
3752 11736 : return NULL;
3753 : }
3754 :
3755 : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
3756 : op1 twice (once as definition, once as else) in the same operation.
3757 : Only count it as one. */
3758 180912 : if (use_stmt != phi_use_stmt)
3759 : {
3760 177047 : nphi_def_loop_uses++;
3761 177047 : phi_use_stmt = use_stmt;
3762 : }
3763 11736 : }
3764 :
3765 126721 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3766 126721 : if (TREE_CODE (latch_def) != SSA_NAME)
3767 : {
3768 1449 : if (dump_enabled_p ())
3769 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3770 : "reduction: not ssa_name: %T\n", latch_def);
3771 1449 : return NULL;
3772 : }
3773 :
3774 125272 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3775 125272 : if (!def_stmt_info
3776 125272 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3777 161 : return NULL;
3778 :
3779 125111 : bool nested_in_vect_loop
3780 125111 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3781 125111 : unsigned nlatch_def_loop_uses = 0;
3782 125111 : auto_vec<gphi *, 3> lcphis;
3783 613014 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3784 : {
3785 362792 : gimple *use_stmt = USE_STMT (use_p);
3786 362792 : if (is_gimple_debug (use_stmt))
3787 109754 : continue;
3788 253038 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3789 164241 : nlatch_def_loop_uses++;
3790 : else
3791 : /* We can have more than one loop-closed PHI. */
3792 88797 : lcphis.safe_push (as_a <gphi *> (use_stmt));
3793 125111 : }
3794 :
3795 : /* If we are vectorizing an inner reduction we are executing that
3796 : in the original order only in case we are not dealing with a
3797 : double reduction. */
3798 125111 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3799 : {
3800 2272 : if (dump_enabled_p ())
3801 433 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3802 : "detected nested cycle: ");
3803 2272 : return def_stmt_info;
3804 : }
3805 :
3806 : /* When the inner loop of a double reduction ends up with more than
3807 : one loop-closed PHI we have failed to classify alternate such
3808 : PHIs as double reduction, leading to wrong code. See PR103237. */
3809 123876 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
3810 : {
3811 1 : if (dump_enabled_p ())
3812 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3813 : "unhandle double reduction\n");
3814 1 : return NULL;
3815 : }
3816 :
3817 : /* If this isn't a nested cycle or if the nested cycle reduction value
3818 : is used ouside of the inner loop we cannot handle uses of the reduction
3819 : value. */
3820 122838 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3821 : {
3822 45191 : if (dump_enabled_p ())
3823 401 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3824 : "reduction used in loop.\n");
3825 45191 : return NULL;
3826 : }
3827 :
3828 : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3829 : defined in the inner loop. */
3830 77647 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3831 : {
3832 1304 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
3833 1304 : if (gimple_phi_num_args (def_stmt) != 1
3834 1304 : || TREE_CODE (op1) != SSA_NAME)
3835 : {
3836 91 : if (dump_enabled_p ())
3837 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3838 : "unsupported phi node definition.\n");
3839 :
3840 91 : return NULL;
3841 : }
3842 :
3843 : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3844 : and the latch definition op1. */
3845 1213 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
3846 1213 : if (gimple_bb (def1)
3847 1213 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3848 1213 : && loop->inner
3849 1159 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3850 1159 : && (is_gimple_assign (def1) || is_gimple_call (def1))
3851 1150 : && is_a <gphi *> (phi_use_stmt)
3852 1138 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
3853 1138 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
3854 : loop_latch_edge (loop->inner)))
3855 2349 : && lcphis.length () == 1)
3856 : {
3857 1049 : if (dump_enabled_p ())
3858 144 : report_vect_op (MSG_NOTE, def_stmt,
3859 : "detected double reduction: ");
3860 :
3861 1049 : *double_reduc = as_a <gphi *> (phi_use_stmt);
3862 1049 : return def_stmt_info;
3863 : }
3864 :
3865 164 : return NULL;
3866 : }
3867 :
3868 : /* Look for the expression computing latch_def from then loop PHI result. */
3869 76343 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3870 76343 : code_helper code;
3871 76343 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3872 : path, inner_loop_of_double_reduc))
3873 : {
3874 69296 : STMT_VINFO_REDUC_CODE (phi_info) = code;
3875 69296 : if (code == COND_EXPR && !nested_in_vect_loop)
3876 8193 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3877 :
3878 : /* Fill in STMT_VINFO_REDUC_IDX. */
3879 69296 : unsigned i;
3880 222470 : for (i = path.length () - 1; i >= 1; --i)
3881 : {
3882 83878 : gimple *stmt = USE_STMT (path[i].second);
3883 83878 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3884 83878 : gimple_match_op op;
3885 83878 : if (!gimple_extract_op (stmt, &op))
3886 0 : gcc_unreachable ();
3887 83878 : if (gassign *assign = dyn_cast<gassign *> (stmt))
3888 80374 : STMT_VINFO_REDUC_IDX (stmt_info)
3889 80374 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3890 : else
3891 : {
3892 3504 : gcall *call = as_a<gcall *> (stmt);
3893 3504 : STMT_VINFO_REDUC_IDX (stmt_info)
3894 3504 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
3895 : }
3896 : }
3897 69296 : if (dump_enabled_p ())
3898 3918 : dump_printf_loc (MSG_NOTE, vect_location,
3899 : "reduction: detected reduction\n");
3900 :
3901 69296 : return def_stmt_info;
3902 : }
3903 :
3904 7047 : if (dump_enabled_p ())
3905 86 : dump_printf_loc (MSG_NOTE, vect_location,
3906 : "reduction: unknown pattern\n");
3907 :
3908 : return NULL;
3909 201454 : }
3910 :
3911 : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3912 : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3913 : or -1 if not known. */
3914 :
3915 : static int
3916 361821 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3917 : {
3918 361821 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3919 361821 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3920 : {
3921 156416 : if (dump_enabled_p ())
3922 2903 : dump_printf_loc (MSG_NOTE, vect_location,
3923 : "cost model: epilogue peel iters set to vf/2 "
3924 : "because loop iterations are unknown .\n");
3925 156416 : return assumed_vf / 2;
3926 : }
3927 : else
3928 : {
3929 205405 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3930 205405 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
3931 205405 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3932 : /* If we need to peel for gaps, but no peeling is required, we have to
3933 : peel VF iterations. */
3934 205405 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3935 205405 : peel_iters_epilogue = assumed_vf;
3936 205405 : return peel_iters_epilogue;
3937 : }
3938 : }
3939 :
3940 : /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3941 : int
3942 279487 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3943 : int *peel_iters_epilogue,
3944 : stmt_vector_for_cost *scalar_cost_vec,
3945 : stmt_vector_for_cost *prologue_cost_vec,
3946 : stmt_vector_for_cost *epilogue_cost_vec)
3947 : {
3948 279487 : int retval = 0;
3949 :
3950 279487 : *peel_iters_epilogue
3951 279487 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3952 :
3953 279487 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3954 : {
3955 : /* If peeled iterations are known but number of scalar loop
3956 : iterations are unknown, count a taken branch per peeled loop. */
3957 107257 : if (peel_iters_prologue > 0)
3958 68643 : retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3959 : vect_prologue);
3960 107257 : if (*peel_iters_epilogue > 0)
3961 107180 : retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3962 : vect_epilogue);
3963 : }
3964 :
3965 279487 : stmt_info_for_cost *si;
3966 279487 : int j;
3967 279487 : if (peel_iters_prologue)
3968 605439 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3969 478282 : retval += record_stmt_cost (prologue_cost_vec,
3970 478282 : si->count * peel_iters_prologue,
3971 : si->kind, si->stmt_info, si->misalign,
3972 : vect_prologue);
3973 279487 : if (*peel_iters_epilogue)
3974 941858 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3975 747455 : retval += record_stmt_cost (epilogue_cost_vec,
3976 747455 : si->count * *peel_iters_epilogue,
3977 : si->kind, si->stmt_info, si->misalign,
3978 : vect_epilogue);
3979 :
3980 279487 : return retval;
3981 : }
3982 :
3983 : /* Function vect_estimate_min_profitable_iters
3984 :
3985 : Return the number of iterations required for the vector version of the
3986 : loop to be profitable relative to the cost of the scalar version of the
3987 : loop.
3988 :
3989 : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3990 : of iterations for vectorization. -1 value means loop vectorization
3991 : is not profitable. This returned value may be used for dynamic
3992 : profitability check.
3993 :
3994 : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3995 : for static check against estimated number of iterations. */
3996 :
3997 : static void
3998 99509 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3999 : int *ret_min_profitable_niters,
4000 : int *ret_min_profitable_estimate,
4001 : unsigned *suggested_unroll_factor)
4002 : {
4003 99509 : int min_profitable_iters;
4004 99509 : int min_profitable_estimate;
4005 99509 : int peel_iters_prologue;
4006 99509 : int peel_iters_epilogue;
4007 99509 : unsigned vec_inside_cost = 0;
4008 99509 : int vec_outside_cost = 0;
4009 99509 : unsigned vec_prologue_cost = 0;
4010 99509 : unsigned vec_epilogue_cost = 0;
4011 99509 : int scalar_single_iter_cost = 0;
4012 99509 : int scalar_outside_cost = 0;
4013 99509 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4014 99509 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4015 99509 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
4016 :
4017 : /* Cost model disabled. */
4018 99509 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4019 : {
4020 16873 : if (dump_enabled_p ())
4021 10600 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4022 16873 : *ret_min_profitable_niters = 0;
4023 16873 : *ret_min_profitable_estimate = 0;
4024 16873 : return;
4025 : }
4026 :
4027 : /* Requires loop versioning tests to handle misalignment. */
4028 82636 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4029 : {
4030 : /* FIXME: Make cost depend on complexity of individual check. */
4031 13 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4032 13 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4033 13 : if (dump_enabled_p ())
4034 1 : dump_printf (MSG_NOTE,
4035 : "cost model: Adding cost of checks for loop "
4036 : "versioning to treat misalignment.\n");
4037 : }
4038 :
4039 : /* Requires loop versioning with alias checks. */
4040 82636 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4041 : {
4042 : /* FIXME: Make cost depend on complexity of individual check. */
4043 4120 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4044 4120 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4045 4120 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4046 2 : if (len)
4047 : /* Count LEN - 1 ANDs and LEN comparisons. */
4048 2 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4049 : scalar_stmt, vect_prologue);
4050 4120 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4051 1108 : if (len)
4052 : {
4053 : /* Count LEN - 1 ANDs and LEN comparisons. */
4054 1108 : unsigned int nstmts = len * 2 - 1;
4055 : /* +1 for each bias that needs adding. */
4056 2216 : for (unsigned int i = 0; i < len; ++i)
4057 1108 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4058 125 : nstmts += 1;
4059 1108 : (void) add_stmt_cost (target_cost_data, nstmts,
4060 : scalar_stmt, vect_prologue);
4061 : }
4062 4120 : if (dump_enabled_p ())
4063 15 : dump_printf (MSG_NOTE,
4064 : "cost model: Adding cost of checks for loop "
4065 : "versioning aliasing.\n");
4066 : }
4067 :
4068 : /* Requires loop versioning with niter checks. */
4069 82636 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4070 : {
4071 : /* FIXME: Make cost depend on complexity of individual check. */
4072 665 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4073 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4074 665 : if (dump_enabled_p ())
4075 1 : dump_printf (MSG_NOTE,
4076 : "cost model: Adding cost of checks for loop "
4077 : "versioning niters.\n");
4078 : }
4079 :
4080 82636 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4081 4794 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4082 : vect_prologue);
4083 :
4084 : /* Count statements in scalar loop. Using this as scalar cost for a single
4085 : iteration for now.
4086 :
4087 : TODO: Add outer loop support.
4088 :
4089 : TODO: Consider assigning different costs to different scalar
4090 : statements. */
4091 :
4092 82636 : scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4093 :
4094 : /* Add additional cost for the peeled instructions in prologue and epilogue
4095 : loop. (For fully-masked loops there will be no peeling.)
4096 :
4097 : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4098 : at compile-time - we assume it's vf/2 (the worst would be vf-1).
4099 :
4100 : TODO: Build an expression that represents peel_iters for prologue and
4101 : epilogue to be used in a run-time test. */
4102 :
4103 82636 : bool prologue_need_br_taken_cost = false;
4104 82636 : bool prologue_need_br_not_taken_cost = false;
4105 :
4106 : /* Calculate peel_iters_prologue. */
4107 82636 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4108 : peel_iters_prologue = 0;
4109 82636 : else if (npeel < 0)
4110 : {
4111 280 : peel_iters_prologue = assumed_vf / 2;
4112 280 : if (dump_enabled_p ())
4113 6 : dump_printf (MSG_NOTE, "cost model: "
4114 : "prologue peel iters set to vf/2.\n");
4115 :
4116 : /* If peeled iterations are unknown, count a taken branch and a not taken
4117 : branch per peeled loop. Even if scalar loop iterations are known,
4118 : vector iterations are not known since peeled prologue iterations are
4119 : not known. Hence guards remain the same. */
4120 : prologue_need_br_taken_cost = true;
4121 : prologue_need_br_not_taken_cost = true;
4122 : }
4123 : else
4124 : {
4125 82356 : peel_iters_prologue = npeel;
4126 82356 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4127 : /* If peeled iterations are known but number of scalar loop
4128 : iterations are unknown, count a taken branch per peeled loop. */
4129 82636 : prologue_need_br_taken_cost = true;
4130 : }
4131 :
4132 82636 : bool epilogue_need_br_taken_cost = false;
4133 82636 : bool epilogue_need_br_not_taken_cost = false;
4134 :
4135 : /* Calculate peel_iters_epilogue. */
4136 82636 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4137 : /* We need to peel exactly one iteration for gaps. */
4138 22 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4139 82614 : else if (npeel < 0)
4140 : {
4141 : /* If peeling for alignment is unknown, loop bound of main loop
4142 : becomes unknown. */
4143 280 : peel_iters_epilogue = assumed_vf / 2;
4144 280 : if (dump_enabled_p ())
4145 6 : dump_printf (MSG_NOTE, "cost model: "
4146 : "epilogue peel iters set to vf/2 because "
4147 : "peeling for alignment is unknown.\n");
4148 :
4149 : /* See the same reason above in peel_iters_prologue calculation. */
4150 : epilogue_need_br_taken_cost = true;
4151 : epilogue_need_br_not_taken_cost = true;
4152 : }
4153 : else
4154 : {
4155 82334 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4156 82334 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4157 : /* If peeled iterations are known but number of scalar loop
4158 : iterations are unknown, count a taken branch per peeled loop. */
4159 82636 : epilogue_need_br_taken_cost = true;
4160 : }
4161 :
4162 82636 : stmt_info_for_cost *si;
4163 82636 : int j;
4164 : /* Add costs associated with peel_iters_prologue. */
4165 82636 : if (peel_iters_prologue)
4166 1028 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4167 : {
4168 739 : (void) add_stmt_cost (target_cost_data,
4169 739 : si->count * peel_iters_prologue, si->kind,
4170 : si->stmt_info, si->node, si->vectype,
4171 : si->misalign, vect_prologue);
4172 : }
4173 :
4174 : /* Add costs associated with peel_iters_epilogue. */
4175 82636 : if (peel_iters_epilogue)
4176 282404 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4177 : {
4178 224083 : (void) add_stmt_cost (target_cost_data,
4179 224083 : si->count * peel_iters_epilogue, si->kind,
4180 : si->stmt_info, si->node, si->vectype,
4181 : si->misalign, vect_epilogue);
4182 : }
4183 :
4184 : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4185 :
4186 82636 : if (prologue_need_br_taken_cost)
4187 280 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4188 : vect_prologue);
4189 :
4190 82636 : if (prologue_need_br_not_taken_cost)
4191 280 : (void) add_stmt_cost (target_cost_data, 1,
4192 : cond_branch_not_taken, vect_prologue);
4193 :
4194 82636 : if (epilogue_need_br_taken_cost)
4195 48863 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4196 : vect_epilogue);
4197 :
4198 82636 : if (epilogue_need_br_not_taken_cost)
4199 280 : (void) add_stmt_cost (target_cost_data, 1,
4200 : cond_branch_not_taken, vect_epilogue);
4201 :
4202 : /* Take care of special costs for rgroup controls of partial vectors. */
4203 22 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4204 82658 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4205 : == vect_partial_vectors_avx512))
4206 : {
4207 : /* Calculate how many masks we need to generate. */
4208 22 : unsigned int num_masks = 0;
4209 22 : bool need_saturation = false;
4210 90 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4211 24 : if (rgm.type)
4212 : {
4213 22 : unsigned nvectors = rgm.factor;
4214 22 : num_masks += nvectors;
4215 22 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4216 22 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4217 7 : need_saturation = true;
4218 : }
4219 :
4220 : /* ??? The target isn't able to identify the costs below as
4221 : producing masks so it cannot penaltize cases where we'd run
4222 : out of mask registers for example. */
4223 :
4224 : /* ??? We are also failing to account for smaller vector masks
4225 : we generate by splitting larger masks in vect_get_loop_mask. */
4226 :
4227 : /* In the worst case, we need to generate each mask in the prologue
4228 : and in the loop body. We need one splat per group and one
4229 : compare per mask.
4230 :
4231 : Sometimes the prologue mask will fold to a constant,
4232 : so the actual prologue cost might be smaller. However, it's
4233 : simpler and safer to use the worst-case cost; if this ends up
4234 : being the tie-breaker between vectorizing or not, then it's
4235 : probably better not to vectorize. */
4236 22 : (void) add_stmt_cost (target_cost_data,
4237 : num_masks
4238 22 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4239 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4240 : vect_prologue);
4241 44 : (void) add_stmt_cost (target_cost_data,
4242 : num_masks
4243 44 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4244 : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4245 :
4246 : /* When we need saturation we need it both in the prologue and
4247 : the epilogue. */
4248 22 : if (need_saturation)
4249 : {
4250 7 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4251 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4252 7 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4253 : NULL, NULL, NULL_TREE, 0, vect_body);
4254 : }
4255 : }
4256 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4257 82614 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4258 : == vect_partial_vectors_while_ult))
4259 : {
4260 : /* Calculate how many masks we need to generate. */
4261 : unsigned int num_masks = 0;
4262 : rgroup_controls *rgm;
4263 : unsigned int num_vectors_m1;
4264 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4265 : num_vectors_m1, rgm)
4266 0 : if (rgm->type)
4267 0 : num_masks += num_vectors_m1 + 1;
4268 0 : gcc_assert (num_masks > 0);
4269 :
4270 : /* In the worst case, we need to generate each mask in the prologue
4271 : and in the loop body. One of the loop body mask instructions
4272 : replaces the comparison in the scalar loop, and since we don't
4273 : count the scalar comparison against the scalar body, we shouldn't
4274 : count that vector instruction against the vector body either.
4275 :
4276 : Sometimes we can use unpacks instead of generating prologue
4277 : masks and sometimes the prologue mask will fold to a constant,
4278 : so the actual prologue cost might be smaller. However, it's
4279 : simpler and safer to use the worst-case cost; if this ends up
4280 : being the tie-breaker between vectorizing or not, then it's
4281 : probably better not to vectorize. */
4282 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4283 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4284 : vect_prologue);
4285 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4286 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4287 : vect_body);
4288 : }
4289 82614 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4290 : {
4291 : /* Referring to the functions vect_set_loop_condition_partial_vectors
4292 : and vect_set_loop_controls_directly, we need to generate each
4293 : length in the prologue and in the loop body if required. Although
4294 : there are some possible optimizations, we consider the worst case
4295 : here. */
4296 :
4297 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4298 0 : signed char partial_load_store_bias
4299 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4300 0 : bool need_iterate_p
4301 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4302 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4303 :
4304 : /* Calculate how many statements to be added. */
4305 0 : unsigned int prologue_stmts = 0;
4306 0 : unsigned int body_stmts = 0;
4307 :
4308 0 : rgroup_controls *rgc;
4309 0 : unsigned int num_vectors_m1;
4310 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4311 0 : if (rgc->type)
4312 : {
4313 : /* May need one SHIFT for nitems_total computation. */
4314 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4315 0 : if (nitems != 1 && !niters_known_p)
4316 0 : prologue_stmts += 1;
4317 :
4318 : /* May need one MAX and one MINUS for wrap around. */
4319 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4320 0 : prologue_stmts += 2;
4321 :
4322 : /* Need one MAX and one MINUS for each batch limit excepting for
4323 : the 1st one. */
4324 0 : prologue_stmts += num_vectors_m1 * 2;
4325 :
4326 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4327 :
4328 : /* Need to set up lengths in prologue, only one MIN required
4329 : for each since start index is zero. */
4330 0 : prologue_stmts += num_vectors;
4331 :
4332 : /* If we have a non-zero partial load bias, we need one PLUS
4333 : to adjust the load length. */
4334 0 : if (partial_load_store_bias != 0)
4335 0 : body_stmts += 1;
4336 :
4337 0 : unsigned int length_update_cost = 0;
4338 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4339 : /* For decrement IV style, Each only need a single SELECT_VL
4340 : or MIN since beginning to calculate the number of elements
4341 : need to be processed in current iteration. */
4342 : length_update_cost = 1;
4343 : else
4344 : /* For increment IV stype, Each may need two MINs and one MINUS to
4345 : update lengths in body for next iteration. */
4346 0 : length_update_cost = 3;
4347 :
4348 0 : if (need_iterate_p)
4349 0 : body_stmts += length_update_cost * num_vectors;
4350 : }
4351 :
4352 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4353 : scalar_stmt, vect_prologue);
4354 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4355 : scalar_stmt, vect_body);
4356 : }
4357 :
4358 : /* FORNOW: The scalar outside cost is incremented in one of the
4359 : following ways:
4360 :
4361 : 1. The vectorizer checks for alignment and aliasing and generates
4362 : a condition that allows dynamic vectorization. A cost model
4363 : check is ANDED with the versioning condition. Hence scalar code
4364 : path now has the added cost of the versioning check.
4365 :
4366 : if (cost > th & versioning_check)
4367 : jmp to vector code
4368 :
4369 : Hence run-time scalar is incremented by not-taken branch cost.
4370 :
4371 : 2. The vectorizer then checks if a prologue is required. If the
4372 : cost model check was not done before during versioning, it has to
4373 : be done before the prologue check.
4374 :
4375 : if (cost <= th)
4376 : prologue = scalar_iters
4377 : if (prologue == 0)
4378 : jmp to vector code
4379 : else
4380 : execute prologue
4381 : if (prologue == num_iters)
4382 : go to exit
4383 :
4384 : Hence the run-time scalar cost is incremented by a taken branch,
4385 : plus a not-taken branch, plus a taken branch cost.
4386 :
4387 : 3. The vectorizer then checks if an epilogue is required. If the
4388 : cost model check was not done before during prologue check, it
4389 : has to be done with the epilogue check.
4390 :
4391 : if (prologue == 0)
4392 : jmp to vector code
4393 : else
4394 : execute prologue
4395 : if (prologue == num_iters)
4396 : go to exit
4397 : vector code:
4398 : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4399 : jmp to epilogue
4400 :
4401 : Hence the run-time scalar cost should be incremented by 2 taken
4402 : branches.
4403 :
4404 : TODO: The back end may reorder the BBS's differently and reverse
4405 : conditions/branch directions. Change the estimates below to
4406 : something more reasonable. */
4407 :
4408 : /* If the number of iterations is known and we do not do versioning, we can
4409 : decide whether to vectorize at compile time. Hence the scalar version
4410 : do not carry cost model guard costs. */
4411 33216 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4412 115852 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4413 : {
4414 : /* Cost model check occurs at versioning. */
4415 50032 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4416 4794 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4417 : else
4418 : {
4419 : /* Cost model check occurs at prologue generation. */
4420 45238 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4421 150 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4422 150 : + vect_get_stmt_cost (cond_branch_not_taken);
4423 : /* Cost model check occurs at epilogue generation. */
4424 : else
4425 45088 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4426 : }
4427 : }
4428 :
4429 : /* Complete the target-specific cost calculations. */
4430 82636 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
4431 82636 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
4432 82636 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
4433 82636 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
4434 82636 : if (suggested_unroll_factor)
4435 82449 : *suggested_unroll_factor
4436 82449 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
4437 :
4438 82449 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
4439 233 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4440 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4441 : *suggested_unroll_factor,
4442 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4443 : {
4444 0 : if (dump_enabled_p ())
4445 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4446 : "can't unroll as unrolled vectorization factor larger"
4447 : " than maximum vectorization factor: "
4448 : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4449 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4450 0 : *suggested_unroll_factor = 1;
4451 : }
4452 :
4453 82636 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4454 :
4455 82636 : if (dump_enabled_p ())
4456 : {
4457 606 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4458 606 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4459 : vec_inside_cost);
4460 606 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4461 : vec_prologue_cost);
4462 606 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4463 : vec_epilogue_cost);
4464 606 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4465 : scalar_single_iter_cost);
4466 606 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4467 : scalar_outside_cost);
4468 606 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4469 : vec_outside_cost);
4470 606 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4471 : peel_iters_prologue);
4472 606 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4473 : peel_iters_epilogue);
4474 : }
4475 :
4476 : /* Calculate number of iterations required to make the vector version
4477 : profitable, relative to the loop bodies only. The following condition
4478 : must hold true:
4479 : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4480 : where
4481 : SIC = scalar iteration cost, VIC = vector iteration cost,
4482 : VOC = vector outside cost, VF = vectorization factor,
4483 : NPEEL = prologue iterations + epilogue iterations,
4484 : SOC = scalar outside cost for run time cost model check. */
4485 :
4486 82636 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4487 82636 : - vec_inside_cost);
4488 82636 : if (saving_per_viter <= 0)
4489 : {
4490 24191 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4491 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4492 : "vectorization did not happen for a simd loop");
4493 :
4494 24191 : if (dump_enabled_p ())
4495 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4496 : "cost model: the vector iteration cost = %d "
4497 : "divided by the scalar iteration cost = %d "
4498 : "is greater or equal to the vectorization factor = %d"
4499 : ".\n",
4500 : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4501 24191 : *ret_min_profitable_niters = -1;
4502 24191 : *ret_min_profitable_estimate = -1;
4503 24191 : return;
4504 : }
4505 :
4506 : /* ??? The "if" arm is written to handle all cases; see below for what
4507 : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4508 58445 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4509 : {
4510 : /* Rewriting the condition above in terms of the number of
4511 : vector iterations (vniters) rather than the number of
4512 : scalar iterations (niters) gives:
4513 :
4514 : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4515 :
4516 : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4517 :
4518 : For integer N, X and Y when X > 0:
4519 :
4520 : N * X > Y <==> N >= (Y /[floor] X) + 1. */
4521 14 : int outside_overhead = (vec_outside_cost
4522 14 : - scalar_single_iter_cost * peel_iters_prologue
4523 14 : - scalar_single_iter_cost * peel_iters_epilogue
4524 : - scalar_outside_cost);
4525 : /* We're only interested in cases that require at least one
4526 : vector iteration. */
4527 14 : int min_vec_niters = 1;
4528 14 : if (outside_overhead > 0)
4529 11 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4530 :
4531 14 : if (dump_enabled_p ())
4532 6 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4533 : min_vec_niters);
4534 :
4535 14 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4536 : {
4537 : /* Now that we know the minimum number of vector iterations,
4538 : find the minimum niters for which the scalar cost is larger:
4539 :
4540 : SIC * niters > VIC * vniters + VOC - SOC
4541 :
4542 : We know that the minimum niters is no more than
4543 : vniters * VF + NPEEL, but it might be (and often is) less
4544 : than that if a partial vector iteration is cheaper than the
4545 : equivalent scalar code. */
4546 14 : int threshold = (vec_inside_cost * min_vec_niters
4547 14 : + vec_outside_cost
4548 14 : - scalar_outside_cost);
4549 14 : if (threshold <= 0)
4550 : min_profitable_iters = 1;
4551 : else
4552 14 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4553 : }
4554 : else
4555 : /* Convert the number of vector iterations into a number of
4556 : scalar iterations. */
4557 0 : min_profitable_iters = (min_vec_niters * assumed_vf
4558 0 : + peel_iters_prologue
4559 : + peel_iters_epilogue);
4560 : }
4561 : else
4562 : {
4563 58431 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4564 58431 : * assumed_vf
4565 58431 : - vec_inside_cost * peel_iters_prologue
4566 58431 : - vec_inside_cost * peel_iters_epilogue);
4567 58431 : if (min_profitable_iters <= 0)
4568 : min_profitable_iters = 0;
4569 : else
4570 : {
4571 49127 : min_profitable_iters /= saving_per_viter;
4572 :
4573 49127 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4574 49127 : <= (((int) vec_inside_cost * min_profitable_iters)
4575 49127 : + (((int) vec_outside_cost - scalar_outside_cost)
4576 : * assumed_vf)))
4577 49127 : min_profitable_iters++;
4578 : }
4579 : }
4580 :
4581 58445 : if (dump_enabled_p ())
4582 584 : dump_printf (MSG_NOTE,
4583 : " Calculated minimum iters for profitability: %d\n",
4584 : min_profitable_iters);
4585 :
4586 58445 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4587 58431 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4588 : /* We want the vectorized loop to execute at least once. */
4589 : min_profitable_iters = assumed_vf + peel_iters_prologue;
4590 10808 : else if (min_profitable_iters < peel_iters_prologue)
4591 : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4592 : vectorized loop executes at least once. */
4593 : min_profitable_iters = peel_iters_prologue;
4594 :
4595 58445 : if (dump_enabled_p ())
4596 584 : dump_printf_loc (MSG_NOTE, vect_location,
4597 : " Runtime profitability threshold = %d\n",
4598 : min_profitable_iters);
4599 :
4600 58445 : *ret_min_profitable_niters = min_profitable_iters;
4601 :
4602 : /* Calculate number of iterations required to make the vector version
4603 : profitable, relative to the loop bodies only.
4604 :
4605 : Non-vectorized variant is SIC * niters and it must win over vector
4606 : variant on the expected loop trip count. The following condition must hold true:
4607 : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4608 :
4609 58445 : if (vec_outside_cost <= 0)
4610 : min_profitable_estimate = 0;
4611 : /* ??? This "else if" arm is written to handle all cases; see below for
4612 : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4613 52930 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4614 : {
4615 : /* This is a repeat of the code above, but with + SOC rather
4616 : than - SOC. */
4617 14 : int outside_overhead = (vec_outside_cost
4618 14 : - scalar_single_iter_cost * peel_iters_prologue
4619 14 : - scalar_single_iter_cost * peel_iters_epilogue
4620 : + scalar_outside_cost);
4621 14 : int min_vec_niters = 1;
4622 14 : if (outside_overhead > 0)
4623 14 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4624 :
4625 14 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4626 : {
4627 14 : int threshold = (vec_inside_cost * min_vec_niters
4628 14 : + vec_outside_cost
4629 14 : + scalar_outside_cost);
4630 14 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4631 : }
4632 : else
4633 : min_profitable_estimate = (min_vec_niters * assumed_vf
4634 : + peel_iters_prologue
4635 : + peel_iters_epilogue);
4636 : }
4637 : else
4638 : {
4639 52916 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4640 52916 : * assumed_vf
4641 52916 : - vec_inside_cost * peel_iters_prologue
4642 52916 : - vec_inside_cost * peel_iters_epilogue)
4643 52916 : / ((scalar_single_iter_cost * assumed_vf)
4644 : - vec_inside_cost);
4645 : }
4646 58445 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4647 58445 : if (dump_enabled_p ())
4648 584 : dump_printf_loc (MSG_NOTE, vect_location,
4649 : " Static estimate profitability threshold = %d\n",
4650 : min_profitable_estimate);
4651 :
4652 58445 : *ret_min_profitable_estimate = min_profitable_estimate;
4653 : }
4654 :
4655 : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4656 : vector elements (not bits) for a vector with NELT elements. */
4657 : static void
4658 2187 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4659 : vec_perm_builder *sel)
4660 : {
4661 : /* The encoding is a single stepped pattern. Any wrap-around is handled
4662 : by vec_perm_indices. */
4663 2187 : sel->new_vector (nelt, 1, 3);
4664 8748 : for (unsigned int i = 0; i < 3; i++)
4665 6561 : sel->quick_push (i + offset);
4666 2187 : }
4667 :
4668 : /* Checks whether the target supports whole-vector shifts for vectors of mode
4669 : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4670 : it supports vec_perm_const with masks for all necessary shift amounts. */
4671 : static bool
4672 7685 : have_whole_vector_shift (machine_mode mode)
4673 : {
4674 7685 : if (can_implement_p (vec_shr_optab, mode))
4675 : return true;
4676 :
4677 : /* Variable-length vectors should be handled via the optab. */
4678 61 : unsigned int nelt;
4679 122 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4680 : return false;
4681 :
4682 61 : vec_perm_builder sel;
4683 61 : vec_perm_indices indices;
4684 307 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4685 : {
4686 246 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
4687 246 : indices.new_vector (sel, 2, nelt);
4688 246 : if (!can_vec_perm_const_p (mode, mode, indices, false))
4689 : return false;
4690 : }
4691 : return true;
4692 61 : }
4693 :
4694 : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4695 : multiplication operands have differing signs and (b) we intend
4696 : to emulate the operation using a series of signed DOT_PROD_EXPRs.
4697 : See vect_emulate_mixed_dot_prod for the actual sequence used. */
4698 :
4699 : static bool
4700 2186 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
4701 : {
4702 2186 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
4703 2186 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4704 1733 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4705 : return false;
4706 :
4707 578 : tree rhs1 = gimple_assign_rhs1 (assign);
4708 578 : tree rhs2 = gimple_assign_rhs2 (assign);
4709 578 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4710 : return false;
4711 :
4712 429 : return !directly_supported_p (DOT_PROD_EXPR,
4713 : SLP_TREE_VECTYPE (slp_node),
4714 143 : SLP_TREE_VECTYPE
4715 : (SLP_TREE_CHILDREN (slp_node)[0]),
4716 143 : optab_vector_mixed_sign);
4717 : }
4718 :
4719 : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4720 : functions. Design better to avoid maintenance issues. */
4721 :
4722 : /* Function vect_model_reduction_cost.
4723 :
4724 : Models cost for a reduction operation, including the vector ops
4725 : generated within the strip-mine loop in some cases, the initial
4726 : definition before the loop, and the epilogue code that must be generated. */
4727 :
4728 : static void
4729 46925 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
4730 : slp_tree node, internal_fn reduc_fn,
4731 : vect_reduction_type reduction_type,
4732 : int ncopies, stmt_vector_for_cost *cost_vec)
4733 : {
4734 46925 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4735 46925 : tree vectype;
4736 46925 : machine_mode mode;
4737 46925 : class loop *loop = NULL;
4738 :
4739 46925 : if (loop_vinfo)
4740 46925 : loop = LOOP_VINFO_LOOP (loop_vinfo);
4741 :
4742 : /* Condition reductions generate two reductions in the loop. */
4743 46925 : if (reduction_type == COND_REDUCTION)
4744 280 : ncopies *= 2;
4745 :
4746 46925 : vectype = SLP_TREE_VECTYPE (node);
4747 46925 : mode = TYPE_MODE (vectype);
4748 46925 : stmt_vec_info orig_stmt_info
4749 46925 : = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
4750 :
4751 46925 : gimple_match_op op;
4752 46925 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4753 0 : gcc_unreachable ();
4754 :
4755 46925 : if (reduction_type == EXTRACT_LAST_REDUCTION)
4756 : /* No extra instructions are needed in the prologue. The loop body
4757 : operations are costed in vectorizable_condition. */
4758 : inside_cost = 0;
4759 46925 : else if (reduction_type == FOLD_LEFT_REDUCTION)
4760 : {
4761 : /* No extra instructions needed in the prologue. */
4762 3927 : prologue_cost = 0;
4763 :
4764 3927 : if (reduc_fn != IFN_LAST)
4765 : /* Count one reduction-like operation per vector. */
4766 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4767 : node, 0, vect_body);
4768 : else
4769 : {
4770 : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4771 3927 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4772 3927 : inside_cost = record_stmt_cost (cost_vec, nelements,
4773 : vec_to_scalar, node, 0,
4774 : vect_body);
4775 3927 : inside_cost += record_stmt_cost (cost_vec, nelements,
4776 : scalar_stmt, node, 0,
4777 : vect_body);
4778 : }
4779 : }
4780 : else
4781 : {
4782 : /* Add in the cost of the initial definitions. */
4783 42998 : int prologue_stmts;
4784 42998 : if (reduction_type == COND_REDUCTION)
4785 : /* For cond reductions we have four vectors: initial index, step,
4786 : initial result of the data reduction, initial value of the index
4787 : reduction. */
4788 : prologue_stmts = 4;
4789 : else
4790 : /* We need the initial reduction value. */
4791 42718 : prologue_stmts = 1;
4792 42998 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4793 : scalar_to_vec, node, 0,
4794 : vect_prologue);
4795 : }
4796 :
4797 : /* Determine cost of epilogue code.
4798 :
4799 : We have a reduction operator that will reduce the vector in one statement.
4800 : Also requires scalar extract. */
4801 :
4802 46925 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4803 : {
4804 46766 : if (reduc_fn != IFN_LAST)
4805 : {
4806 35345 : if (reduction_type == COND_REDUCTION)
4807 : {
4808 : /* An EQ stmt and an COND_EXPR stmt. */
4809 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4810 : vector_stmt, node, 0,
4811 : vect_epilogue);
4812 : /* Reduction of the max index and a reduction of the found
4813 : values. */
4814 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4815 : vec_to_scalar, node, 0,
4816 : vect_epilogue);
4817 : /* A broadcast of the max value. */
4818 8 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4819 : scalar_to_vec, node, 0,
4820 : vect_epilogue);
4821 : }
4822 : else
4823 : {
4824 35337 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4825 : node, 0, vect_epilogue);
4826 35337 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4827 : vec_to_scalar, node, 0,
4828 : vect_epilogue);
4829 : }
4830 : }
4831 11421 : else if (reduction_type == COND_REDUCTION)
4832 : {
4833 272 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4834 : /* Extraction of scalar elements. */
4835 544 : epilogue_cost += record_stmt_cost (cost_vec,
4836 272 : 2 * estimated_nunits,
4837 : vec_to_scalar, node, 0,
4838 : vect_epilogue);
4839 : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4840 272 : epilogue_cost += record_stmt_cost (cost_vec,
4841 272 : 2 * estimated_nunits - 3,
4842 : scalar_stmt, node, 0,
4843 : vect_epilogue);
4844 : }
4845 11149 : else if (reduction_type == EXTRACT_LAST_REDUCTION
4846 11149 : || reduction_type == FOLD_LEFT_REDUCTION)
4847 : /* No extra instructions need in the epilogue. */
4848 : ;
4849 : else
4850 : {
4851 7222 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4852 7222 : tree bitsize = TYPE_SIZE (op.type);
4853 7222 : int element_bitsize = tree_to_uhwi (bitsize);
4854 7222 : int nelements = vec_size_in_bits / element_bitsize;
4855 :
4856 7222 : if (op.code == COND_EXPR)
4857 28 : op.code = MAX_EXPR;
4858 :
4859 : /* We have a whole vector shift available. */
4860 968 : if (VECTOR_MODE_P (mode)
4861 7222 : && directly_supported_p (op.code, vectype)
4862 13035 : && have_whole_vector_shift (mode))
4863 : {
4864 : /* Final reduction via vector shifts and the reduction operator.
4865 : Also requires scalar extract. */
4866 17439 : epilogue_cost += record_stmt_cost (cost_vec,
4867 11626 : exact_log2 (nelements) * 2,
4868 : vector_stmt, node, 0,
4869 : vect_epilogue);
4870 5813 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4871 : vec_to_scalar, node, 0,
4872 : vect_epilogue);
4873 : }
4874 : else
4875 : /* Use extracts and reduction op for final reduction. For N
4876 : elements, we have N extracts and N-1 reduction ops. */
4877 1409 : epilogue_cost += record_stmt_cost (cost_vec,
4878 1409 : nelements + nelements - 1,
4879 : vector_stmt, node, 0,
4880 : vect_epilogue);
4881 : }
4882 : }
4883 :
4884 46925 : if (dump_enabled_p ())
4885 2846 : dump_printf (MSG_NOTE,
4886 : "vect_model_reduction_cost: inside_cost = %d, "
4887 : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4888 : prologue_cost, epilogue_cost);
4889 46925 : }
4890 :
4891 : /* SEQ is a sequence of instructions that initialize the reduction
4892 : described by REDUC_INFO. Emit them in the appropriate place:
4893 : either on the loop preheader edge, or — when the reduction reuses
4894 : an accumulator from the main loop — in the guard block that can
4895 : skip the main loop. */
4896 :
4894 : static void
4895 445 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4896 : vect_reduc_info reduc_info, gimple *seq)
4897 : {
4898 445 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
4899 : {
4900 : /* When reusing an accumulator from the main loop, we only need
4901 : initialization instructions if the main loop can be skipped.
4902 : In that case, emit the initialization instructions at the end
4903 : of the guard block that does the skip. */
4904 25 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
4905 25 : gcc_assert (skip_edge);
4906 : /* Insert just before the guard block's last statement, so the
4907 : new definitions are emitted ahead of the block's exit. */
4906 25 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4907 25 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4908 : }
4909 : else
4910 : {
4911 : /* The normal case: emit the initialization instructions on the
4912 : preheader edge. */
4913 420 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4914 420 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4915 : }
4916 445 : }
4917 :
4918 : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4919 : which performs a reduction involving GROUP_SIZE scalar statements.
4920 : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4921 : is nonnull, introducing extra elements of that value will not change the
4922 : result. The vector defs of type VECTOR_TYPE are pushed to *VEC_OPRNDS;
4923 : any statements needed to build them are emitted via
4924 : vect_emit_reduction_init_stmts. */
4923 :
4924 : static void
4925 21855 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4926 : vect_reduc_info reduc_info,
4927 : tree vector_type,
4928 : vec<tree> *vec_oprnds,
4929 : unsigned int number_of_vectors,
4930 : unsigned int group_size, tree neutral_op)
4931 : {
4932 21855 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
4933 21855 : unsigned HOST_WIDE_INT nunits;
4934 21855 : unsigned j, number_of_places_left_in_vector;
4935 21855 : unsigned int i;
4936 :
4937 43710 : gcc_assert (group_size == initial_values.length () || neutral_op);
4938 :
4939 : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4940 : created vectors. It is greater than 1 if unrolling is performed.
4941 :
4942 : For example, we have two scalar operands, s1 and s2 (e.g., group of
4943 : strided accesses of size two), while NUNITS is four (i.e., four scalars
4944 : of this type can be packed in a vector). The output vector will contain
4945 : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4946 : will be 2).
4947 :
4948 : If GROUP_SIZE > NUNITS, the scalars will be split into several
4949 : vectors containing the operands.
4950 :
4951 : For example, NUNITS is four as before, and the group size is 8
4952 : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4953 : {s5, s6, s7, s8}. */
4954 :
4955 : /* For variable-length vectors fall back to filling GROUP_SIZE
4956 : elements per builder pass. */
4955 21855 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4956 : nunits = group_size;
4957 :
4958 21855 : tree vector_elt_type = TREE_TYPE (vector_type);
4959 21855 : number_of_places_left_in_vector = nunits;
4960 21855 : bool constant_p = true;
4961 21855 : tree_vector_builder elts (vector_type, nunits, 1);
4962 21855 : elts.quick_grow (nunits);
4963 21855 : gimple_seq ctor_seq = NULL;
4964 : /* Map the scalar neutral value to the vector's element type:
4965 : booleans are materialized as an all-ones/zero element via a
4966 : COND_EXPR, anything else via a plain conversion. */
4964 21855 : if (neutral_op
4965 43139 : && !useless_type_conversion_p (vector_elt_type,
4966 21284 : TREE_TYPE (neutral_op)))
4967 : {
4968 220 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4969 199 : neutral_op = gimple_build (&ctor_seq, COND_EXPR,
4970 : vector_elt_type,
4971 : neutral_op,
4972 : build_all_ones_cst (vector_elt_type),
4973 : build_zero_cst (vector_elt_type));
4974 : else
4975 21 : neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
4976 : }
4977 210511 : for (j = 0; j < nunits * number_of_vectors; ++j)
4978 : {
4979 188656 : tree op;
4980 188656 : i = j % group_size;
4981 :
4982 : /* Get the def before the loop. In reduction chain we have only
4983 : one initial value. Else we have as many as PHIs in the group. */
4984 188656 : if (i >= initial_values.length () || (j > i && neutral_op))
4985 : op = neutral_op;
4986 : else
4987 : {
4988 : /* Same element-type mapping as for NEUTRAL_OP above, cached
4989 : back into INITIAL_VALUES so it is done at most once. */
4988 51304 : if (!useless_type_conversion_p (vector_elt_type,
4989 25652 : TREE_TYPE (initial_values[i])))
4990 : {
4991 235 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4992 422 : initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
4993 : vector_elt_type,
4994 211 : initial_values[i],
4995 : build_all_ones_cst
4996 : (vector_elt_type),
4997 : build_zero_cst
4998 : (vector_elt_type));
4999 : else
5000 48 : initial_values[i] = gimple_convert (&ctor_seq,
5001 : vector_elt_type,
5002 24 : initial_values[i]);
5003 : }
5004 25652 : op = initial_values[i];
5005 : }
5006 :
5007 : /* Create 'vect_ = {op0,op1,...,opn}'. */
5008 188656 : number_of_places_left_in_vector--;
5009 188656 : elts[nunits - number_of_places_left_in_vector - 1] = op;
5010 188656 : if (!CONSTANT_CLASS_P (op))
5011 2360 : constant_p = false;
5012 :
5013 188656 : if (number_of_places_left_in_vector == 0)
5014 : {
5015 23317 : tree init;
5016 : /* A fully-constant pattern without a neutral value can also
5017 : fill a (possibly variable-length) vector whose element
5018 : count is a multiple of NUNITS by repetition, hence
5019 : multiple_p; otherwise the counts must match exactly. */
5016 46634 : if (constant_p && !neutral_op
5017 46346 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5018 23317 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5019 : /* Build the vector directly from ELTS. */
5020 23317 : init = gimple_build_vector (&ctor_seq, &elts);
5021 0 : else if (neutral_op)
5022 : {
5023 : /* Build a vector of the neutral value and shift the
5024 : other elements into place. */
5025 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5026 : neutral_op);
5027 0 : int k = nunits;
5028 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5029 : k -= 1;
5030 0 : while (k > 0)
5031 : {
5032 0 : k -= 1;
5033 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5034 0 : vector_type, init, elts[k]);
5035 : }
5036 : }
5037 : else
5038 : {
5039 : /* First time round, duplicate ELTS to fill the
5040 : required number of vectors. */
5041 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5042 : elts, number_of_vectors, *vec_oprnds);
5043 0 : break;
5044 : }
5045 23317 : vec_oprnds->quick_push (init);
5046 :
5047 : /* Reset the builder state for the next vector. */
5047 23317 : number_of_places_left_in_vector = nunits;
5048 23317 : elts.new_vector (vector_type, nunits, 1);
5049 23317 : elts.quick_grow (nunits);
5050 23317 : constant_p = true;
5051 : }
5052 : }
5053 : /* Emit any statements the construction above required. */
5053 21855 : if (ctor_seq != NULL)
5054 445 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5055 21855 : }
5056 :
: /* Return the vect_reduc_info describing the reduction that SLP node
: NODE in LOOP_VINFO takes part in, or NULL if NODE is not part of a
: reduction cycle (its cycle id is -1). */
:
5057 : vect_reduc_info
5058 133039 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
5059 : {
5060 133039 : if (node->cycle_info.id == -1)
5061 : return NULL;
5062 131129 : return loop_vinfo->reduc_infos[node->cycle_info.id];
5063 : }
5064 :
5065 : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5066 : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5067 : return false. VECTYPE is the vector type that the epilogue loop uses
5068 : for the reduction. */
5068 :
5069 : static bool
5070 21494 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5071 : vect_reduc_info reduc_info, tree vectype)
5072 : {
5073 : /* Only an epilogue loop (one with an original main-loop vinfo) can
5074 : reuse an accumulator. */
5073 21494 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5074 21494 : if (!main_loop_vinfo)
5075 : return false;
5076 :
5077 : /* Only plain tree-code reductions are handled here. */
5077 4839 : if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5078 : return false;
5079 :
5080 : /* We are not set up to handle vector bools when they are not mapped
5081 : to vector integer data types. */
5082 4824 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5083 4894 : && GET_MODE_CLASS (TYPE_MODE (vectype)) != MODE_VECTOR_INT)
5084 : return false;
5085 :
5086 4822 : unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
5087 4822 : auto_vec<tree, 16> main_loop_results (num_phis);
5088 4822 : auto_vec<tree, 16> initial_values (num_phis);
5089 4822 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5090 : {
5091 : /* The epilogue loop can be entered either from the main loop or
5092 : from an earlier guard block. */
5093 4599 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5094 18420 : for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
5095 : {
5096 : /* Look for:
5097 :
5098 : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5099 : INITIAL_VALUE(guard block)>. */
5100 4623 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5101 :
5102 4623 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5103 4623 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5104 :
5105 4623 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5106 4623 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5107 :
5108 4623 : main_loop_results.quick_push (from_main_loop);
5109 4623 : initial_values.quick_push (from_skip);
5110 : }
5111 : }
5112 : else
5113 : /* The main loop dominates the epilogue loop. */
5114 223 : main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));
5115 :
5116 : /* See if the main loop has the kind of accumulator we need.
5117 : The accumulator only matches if it covers exactly the same set of
5118 : scalar results, in the same order. */
5117 4822 : vect_reusable_accumulator *accumulator
5118 4822 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5119 4822 : if (!accumulator
5120 9628 : || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
5121 14446 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5122 : VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
5123 : return false;
5124 :
5125 : /* Handle the case where we can reduce wider vectors to narrower ones. */
5126 4812 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5127 4812 : unsigned HOST_WIDE_INT m;
5128 4812 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5129 4812 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5130 0 : return false;
5131 : /* Check the intermediate vector types and operations are available:
5132 : each halving step from the main loop's vector type down to VECTYPE
5133 : needs both a supported reduction operation and a supported
5134 : subvector extraction. */
5132 4812 : tree prev_vectype = old_vectype;
5133 4812 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5134 13913 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5135 : {
5136 4811 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5137 4811 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5138 4811 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5139 4811 : if (!intermediate_vectype
5140 4811 : || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
5141 : intermediate_vectype)
5142 9102 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5143 4291 : TYPE_MODE (intermediate_vectype)))
5144 : return false;
5145 : prev_vectype = intermediate_vectype;
5146 : }
5147 :
5148 : /* Non-SLP reductions might apply an adjustment after the reduction
5149 : operation, in order to simplify the initialization of the accumulator.
5150 : If the epilogue loop carries on from where the main loop left off,
5151 : it should apply the same adjustment to the final reduction result.
5152 :
5153 : If the epilogue loop can also be entered directly (rather than via
5154 : the main loop), we need to be able to handle that case in the same way,
5155 : with the same adjustment. (In principle we could add a PHI node
5156 : to select the correct adjustment, but in practice that shouldn't be
5157 : necessary.) */
5158 4290 : tree main_adjustment
5159 4290 : = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5160 4290 : if (loop_vinfo->main_loop_edge && main_adjustment)
5161 : {
5162 3631 : gcc_assert (num_phis == 1);
5163 3631 : tree initial_value = initial_values[0];
5164 : /* Check that we can use INITIAL_VALUE as the adjustment and
5165 : initialize the accumulator with a neutral value instead. */
5166 3631 : if (!operand_equal_p (initial_value, main_adjustment))
5167 : return false;
5168 3525 : initial_values[0] = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
5169 : }
5170 : /* Commit: record the adjustment, replace the initial values and
5171 : remember the accumulator being reused. */
5170 4184 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5171 4184 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
5172 4184 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
5173 4184 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
5174 4184 : return true;
5175 4822 : }
5176 :
5177 : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5178 : CODE, appending the new stmts to SEQ. Returns a vector def of
5179 : VECTYPE. */
5179 :
5180 : static tree
5181 4228 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5182 : gimple_seq *seq)
5183 : {
5184 4228 : gcc_assert (!VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (vec_def))
5185 : || (GET_MODE_CLASS (TYPE_MODE (TREE_TYPE (vec_def)))
5186 : == MODE_VECTOR_INT));
5187 4228 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5188 4228 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5189 4228 : tree stype = TREE_TYPE (vectype);
5190 4228 : tree new_temp = vec_def;
5191 : /* Repeatedly split NEW_TEMP into its low and high halves and combine
5192 : them with CODE until only NUNITS1 elements remain. */
5191 8448 : while (nunits > nunits1)
5192 : {
5193 4220 : nunits /= 2;
5194 4220 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5195 4220 : stype, nunits);
5196 4220 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5197 :
5198 : /* The target has to make sure we support lowpart/highpart
5199 : extraction, either via direct vector extract or through
5200 : an integer mode punning. */
5201 4220 : tree dst1, dst2;
5202 4220 : gimple *epilog_stmt;
5203 4220 : if (convert_optab_handler (vec_extract_optab,
5204 4220 : TYPE_MODE (TREE_TYPE (new_temp)),
5205 4220 : TYPE_MODE (vectype1))
5206 : != CODE_FOR_nothing)
5207 : {
5208 : /* Extract sub-vectors directly once vec_extract becomes
5209 : a conversion optab. DST1 is the low half (bit offset 0),
5210 : DST2 the high half (bit offset BITSIZE). */
5210 2685 : dst1 = make_ssa_name (vectype1);
5211 2685 : epilog_stmt
5212 5370 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5213 : build3 (BIT_FIELD_REF, vectype1,
5214 2685 : new_temp, TYPE_SIZE (vectype1),
5215 : bitsize_int (0)));
5216 2685 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5217 2685 : dst2 = make_ssa_name (vectype1);
5218 2685 : epilog_stmt
5219 2685 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5220 : build3 (BIT_FIELD_REF, vectype1,
5221 2685 : new_temp, TYPE_SIZE (vectype1),
5222 2685 : bitsize_int (bitsize)));
5223 2685 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5224 : }
5225 : else
5226 : {
5227 : /* Extract via punning to appropriately sized integer mode
5228 : vector: view the input as a two-element integer vector,
5229 : extract each integer half and view-convert it back to
5230 : VECTYPE1. */
5229 1535 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5230 1535 : tree etype = build_vector_type (eltype, 2);
5231 3070 : gcc_assert (convert_optab_handler (vec_extract_optab,
5232 : TYPE_MODE (etype),
5233 : TYPE_MODE (eltype))
5234 : != CODE_FOR_nothing);
5235 1535 : tree tem = make_ssa_name (etype);
5236 1535 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5237 : build1 (VIEW_CONVERT_EXPR,
5238 : etype, new_temp));
5239 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5240 1535 : new_temp = tem;
5241 1535 : tem = make_ssa_name (eltype);
5242 1535 : epilog_stmt
5243 3070 : = gimple_build_assign (tem, BIT_FIELD_REF,
5244 : build3 (BIT_FIELD_REF, eltype,
5245 1535 : new_temp, TYPE_SIZE (eltype),
5246 : bitsize_int (0)));
5247 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5248 1535 : dst1 = make_ssa_name (vectype1);
5249 1535 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5250 : build1 (VIEW_CONVERT_EXPR,
5251 : vectype1, tem));
5252 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5253 1535 : tem = make_ssa_name (eltype);
5254 1535 : epilog_stmt
5255 1535 : = gimple_build_assign (tem, BIT_FIELD_REF,
5256 : build3 (BIT_FIELD_REF, eltype,
5257 1535 : new_temp, TYPE_SIZE (eltype),
5258 1535 : bitsize_int (bitsize)));
5259 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5260 1535 : dst2 = make_ssa_name (vectype1);
5261 1535 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5262 : build1 (VIEW_CONVERT_EXPR,
5263 : vectype1, tem));
5264 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5265 : }
5266 :
5267 : /* Combine the two halves with the reduction operation. */
5267 4220 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5268 : }
5269 : /* The type of the result may still differ from VECTYPE (e.g. after
5270 : the integer-mode punning above); view-convert it if so. */
5269 4228 : if (!useless_type_conversion_p (vectype, TREE_TYPE (new_temp)))
5270 : {
5271 66 : tree dst3 = make_ssa_name (vectype);
5272 66 : gimple *epilog_stmt = gimple_build_assign (dst3, VIEW_CONVERT_EXPR,
5273 : build1 (VIEW_CONVERT_EXPR,
5274 : vectype, new_temp));
5275 66 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5276 66 : new_temp = dst3;
5277 : }
5278 :
5279 4228 : return new_temp;
5280 : }
5281 :
5282 : /* Function vect_create_epilog_for_reduction
5283 :
5284 : Create code at the loop-epilog to finalize the result of a reduction
5285 : computation.
5286 :
5287 : STMT_INFO is the scalar reduction stmt that is being vectorized.
5288 : SLP_NODE is an SLP node containing a group of reduction statements. The
5289 : first one in this group is STMT_INFO.
5290 : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5291 : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5292 : (counting from 0)
5293 : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5294 : exit this edge is always the main loop exit.
5295 :
5296 : This function:
5297 : 1. Completes the reduction def-use cycles.
5298 : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5299 : by calling the function specified by REDUC_FN if available, or by
5300 : other means (whole-vector shifts or a scalar loop).
5301 : The function also creates a new phi node at the loop exit to preserve
5302 : loop-closed form, as illustrated below.
5303 :
5304 : The flow at the entry to this function:
5305 :
5306 : loop:
5307 : vec_def = phi <vec_init, null> # REDUCTION_PHI
5308 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5309 : s_loop = scalar_stmt # (scalar) STMT_INFO
5310 : loop_exit:
5311 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5312 : use <s_out0>
5313 : use <s_out0>
5314 :
5315 : The above is transformed by this function into:
5316 :
5317 : loop:
5318 : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5319 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5320 : s_loop = scalar_stmt # (scalar) STMT_INFO
5321 : loop_exit:
5322 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5323 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5324 : v_out2 = reduce <v_out1>
5325 : s_out3 = extract_field <v_out2, 0>
5326 : s_out4 = adjust_result <s_out3>
5327 : use <s_out4>
5328 : use <s_out4>
5329 : */
5330 :
5331 : static void
5332 22202 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5333 : stmt_vec_info stmt_info,
5334 : slp_tree slp_node,
5335 : slp_instance slp_node_instance,
5336 : edge loop_exit)
5337 : {
5338 22202 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
5339 22202 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5340 22202 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
5341 22202 : tree vectype;
5342 22202 : machine_mode mode;
5343 22202 : basic_block exit_bb;
5344 22202 : gimple *new_phi = NULL, *phi = NULL;
5345 22202 : gimple_stmt_iterator exit_gsi;
5346 22202 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5347 22202 : gimple *epilog_stmt = NULL;
5348 22202 : gimple *exit_phi;
5349 22202 : tree def;
5350 22202 : tree orig_name, scalar_result;
5351 22202 : imm_use_iterator imm_iter;
5352 22202 : use_operand_p use_p;
5353 22202 : gimple *use_stmt;
5354 22202 : auto_vec<tree> reduc_inputs;
5355 22202 : int j, i;
5356 22202 : vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
5357 22202 : unsigned int k;
5358 : /* SLP reduction without reduction chain, e.g.,
5359 : # a1 = phi <a2, a0>
5360 : # b1 = phi <b2, b0>
5361 : a2 = operation (a1)
5362 : b2 = operation (b1) */
5363 22202 : const bool slp_reduc = !reduc_info->is_reduc_chain;
5364 22202 : tree induction_index = NULL_TREE;
5365 :
5366 22202 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5367 :
5368 22202 : bool double_reduc = false;
5369 22202 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5370 22202 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5371 : {
5372 0 : double_reduc = true;
5373 0 : gcc_assert (slp_reduc);
5374 : }
5375 :
5376 22202 : vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
5377 22202 : gcc_assert (vectype);
5378 22202 : mode = TYPE_MODE (vectype);
5379 :
5380 22202 : tree induc_val = NULL_TREE;
5381 22202 : tree adjustment_def = NULL;
5382 : /* Optimize: for induction condition reduction, if we can't use zero
5383 : for induc_val, use initial_def. */
5384 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5385 62 : induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
5386 22140 : else if (double_reduc)
5387 : ;
5388 : else
5389 22140 : adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
5390 :
5391 22202 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5392 22202 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5393 22202 : if (slp_reduc)
5394 : /* All statements produce live-out values. */
5395 43996 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5396 :
5397 22202 : unsigned vec_num
5398 22202 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5399 :
5400 : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5401 : which is updated with the current index of the loop for every match of
5402 : the original loop's cond_expr (VEC_STMT). This results in a vector
5403 : containing the last time the condition passed for that vector lane.
5404 : The first match will be a 1 to allow 0 to be used for non-matching
5405 : indexes. If there are no matches at all then the vector will be all
5406 : zeroes.
5407 :
5408 : PR92772: This algorithm is broken for architectures that support
5409 : masked vectors, but do not provide fold_extract_last. */
5410 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
5411 : {
5412 67 : gcc_assert (!double_reduc);
5413 67 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5414 67 : slp_tree cond_node = slp_node_instance->root;
5415 143 : while (cond_node != slp_node_instance->reduc_phis)
5416 : {
5417 76 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5418 76 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5419 : {
5420 76 : gimple *vec_stmt
5421 76 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5422 76 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5423 76 : ccompares.safe_push
5424 76 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5425 76 : SLP_TREE_REDUC_IDX (cond_node) == 2));
5426 : }
5427 76 : int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
5428 76 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5429 : }
5430 67 : gcc_assert (ccompares.length () != 0);
5431 :
5432 67 : tree indx_before_incr, indx_after_incr;
5433 67 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5434 67 : int scalar_precision
5435 67 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5436 67 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5437 67 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
5438 67 : (TYPE_MODE (vectype), cr_index_scalar_type,
5439 : TYPE_VECTOR_SUBPARTS (vectype));
5440 :
5441 : /* First we create a simple vector induction variable which starts
5442 : with the values {1,2,3,...} (SERIES_VECT) and increments by the
5443 : vector size (STEP). */
5444 :
5445 : /* Create a {1,2,3,...} vector. */
5446 67 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5447 :
5448 : /* Create a vector of the step value. */
5449 67 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5450 67 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5451 :
5452 : /* Create an induction variable. */
5453 67 : gimple_stmt_iterator incr_gsi;
5454 67 : bool insert_after;
5455 67 : vect_iv_increment_position (LOOP_VINFO_MAIN_EXIT (loop_vinfo),
5456 : &incr_gsi, &insert_after);
5457 67 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5458 : insert_after, &indx_before_incr, &indx_after_incr);
5459 :
5460 : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5461 : filled with zeros (VEC_ZERO). */
5462 :
5463 : /* Create a vector of 0s. */
5464 67 : tree zero = build_zero_cst (cr_index_scalar_type);
5465 67 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5466 :
5467 : /* Create a vector phi node. */
5468 67 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5469 67 : new_phi = create_phi_node (new_phi_tree, loop->header);
5470 67 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5471 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
5472 :
5473 : /* Now take the condition from the loops original cond_exprs
5474 : and produce a new cond_exprs (INDEX_COND_EXPR) which for
5475 : every match uses values from the induction variable
5476 : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5477 : (NEW_PHI_TREE).
5478 : Finally, we update the phi (NEW_PHI_TREE) to take the value of
5479 : the new cond_expr (INDEX_COND_EXPR). */
5480 67 : gimple_seq stmts = NULL;
5481 210 : for (int i = ccompares.length () - 1; i != -1; --i)
5482 : {
5483 76 : tree ccompare = ccompares[i].first;
5484 76 : if (ccompares[i].second)
5485 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5486 : cr_index_vector_type,
5487 : ccompare,
5488 : indx_before_incr, new_phi_tree);
5489 : else
5490 7 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5491 : cr_index_vector_type,
5492 : ccompare,
5493 : new_phi_tree, indx_before_incr);
5494 : }
5495 67 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5496 :
5497 : /* Update the phi with the vec cond. */
5498 67 : induction_index = new_phi_tree;
5499 67 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5500 : loop_latch_edge (loop), UNKNOWN_LOCATION);
5501 67 : }
5502 :
5503 : /* 2. Create epilog code.
5504 : The reduction epilog code operates across the elements of the vector
5505 : of partial results computed by the vectorized loop.
5506 : The reduction epilog code consists of:
5507 :
5508 : step 1: compute the scalar result in a vector (v_out2)
5509 : step 2: extract the scalar result (s_out3) from the vector (v_out2)
5510 : step 3: adjust the scalar result (s_out3) if needed.
5511 :
5512 : Step 1 can be accomplished using one the following three schemes:
5513 : (scheme 1) using reduc_fn, if available.
5514 : (scheme 2) using whole-vector shifts, if available.
5515 : (scheme 3) using a scalar loop. In this case steps 1+2 above are
5516 : combined.
5517 :
5518 : The overall epilog code looks like this:
5519 :
5520 : s_out0 = phi <s_loop> # original EXIT_PHI
5521 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5522 : v_out2 = reduce <v_out1> # step 1
5523 : s_out3 = extract_field <v_out2, 0> # step 2
5524 : s_out4 = adjust_result <s_out3> # step 3
5525 :
5526 : (step 3 is optional, and steps 1 and 2 may be combined).
5527 : Lastly, the uses of s_out0 are replaced by s_out4. */
5528 :
5529 :
5530 : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5531 : v_out1 = phi <VECT_DEF>
5532 : Store them in NEW_PHIS. */
5533 : /* We need to reduce values in all exits. */
5534 22202 : exit_bb = loop_exit->dest;
5535 22202 : exit_gsi = gsi_after_labels (exit_bb);
5536 22202 : reduc_inputs.create (vec_num);
5537 45876 : for (unsigned i = 0; i < vec_num; i++)
5538 : {
5539 23674 : gimple_seq stmts = NULL;
5540 23674 : def = vect_get_slp_vect_def (slp_node, i);
5541 23674 : tree new_def = copy_ssa_name (def);
5542 23674 : phi = create_phi_node (new_def, exit_bb);
5543 23674 : if (LOOP_VINFO_MAIN_EXIT (loop_vinfo) == loop_exit)
5544 23647 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
5545 : else
5546 : {
5547 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
5548 30 : SET_PHI_ARG_DEF (phi, k, def);
5549 : }
5550 23674 : new_def = gimple_convert (&stmts, vectype, new_def);
5551 23674 : reduc_inputs.quick_push (new_def);
5552 23674 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5553 : }
5554 :
5555 : /* 2.2 Get the original scalar reduction variable as defined in the loop.
5556 : In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
5557 : pattern), the scalar-def is taken from the original stmt that the
5558 : pattern-stmt (STMT) replaces. */
5559 :
5560 23019 : tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5561 22202 : tree scalar_type = TREE_TYPE (scalar_dest);
5562 22202 : scalar_results.truncate (0);
5563 22202 : scalar_results.reserve_exact (group_size);
5564 22202 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5565 :
5566 : /* True if we should implement SLP_REDUC using native reduction operations
5567 : instead of scalar operations. */
5568 22202 : const bool direct_slp_reduc
5569 22202 : = (reduc_fn != IFN_LAST
5570 22202 : && slp_reduc
5571 22202 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5572 :
5573 : /* If signed overflow is undefined we might need to perform reduction
5574 : computations in an unsigned type. */
5575 22202 : tree compute_vectype = vectype;
5576 22202 : if (ANY_INTEGRAL_TYPE_P (vectype)
5577 15202 : && TYPE_OVERFLOW_UNDEFINED (vectype)
5578 5549 : && code.is_tree_code ()
5579 27751 : && arith_code_with_undefined_signed_overflow ((tree_code) code))
5580 4086 : compute_vectype = unsigned_type_for (vectype);
5581 :
5582 : /* In case of reduction chain, e.g.,
5583 : # a1 = phi <a3, a0>
5584 : a2 = operation (a1)
5585 : a3 = operation (a2),
5586 :
5587 : we may end up with more than one vector result. Here we reduce them
5588 : to one vector.
5589 :
5590 : The same is true for a SLP reduction, e.g.,
5591 : # a1 = phi <a2, a0>
5592 : # b1 = phi <b2, b0>
5593 : a2 = operation (a1)
5594 : b2 = operation (b1),
5595 :
5596 : where we can end up with more than one vector as well. We can
5597 : easily accumulate vectors when the number of vector elements is
5598 : a multiple of the SLP group size.
5599 :
5600 : The same is true if we couldn't use a single defuse cycle. */
5601 22202 : if ((!slp_reduc
5602 : || direct_slp_reduc
5603 : || (slp_reduc
5604 22202 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
5605 44404 : && reduc_inputs.length () > 1)
5606 : {
5607 542 : gimple_seq stmts = NULL;
5608 542 : tree single_input = reduc_inputs[0];
5609 542 : if (compute_vectype != vectype)
5610 157 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5611 : compute_vectype, single_input);
5612 1849 : for (k = 1; k < reduc_inputs.length (); k++)
5613 : {
5614 1307 : tree input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5615 1307 : compute_vectype, reduc_inputs[k]);
5616 1307 : single_input = gimple_build (&stmts, code, compute_vectype,
5617 : single_input, input);
5618 : }
5619 542 : if (compute_vectype != vectype)
5620 157 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5621 : vectype, single_input);
5622 542 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5623 :
5624 542 : reduc_inputs.truncate (0);
5625 542 : reduc_inputs.safe_push (single_input);
5626 : }
5627 :
5628 22202 : tree orig_reduc_input = reduc_inputs[0];
5629 :
5630 : /* If this loop is an epilogue loop that can be skipped after the
5631 : main loop, we can only share a reduction operation between the
5632 : main loop and the epilogue if we put it at the target of the
5633 : skip edge.
5634 :
5635 : We can still reuse accumulators if this check fails. Doing so has
5636 : the minor(?) benefit of making the epilogue loop's scalar result
5637 : independent of the main loop's scalar result. */
5638 22202 : bool unify_with_main_loop_p = false;
5639 22202 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
5640 4184 : && loop_vinfo->skip_this_loop_edge
5641 3944 : && single_succ_p (exit_bb)
5642 22223 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5643 : {
5644 21 : unify_with_main_loop_p = true;
5645 :
5646 21 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5647 21 : reduc_inputs[0] = make_ssa_name (vectype);
5648 21 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5649 21 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5650 : UNKNOWN_LOCATION);
5651 21 : add_phi_arg (new_phi,
5652 21 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
5653 : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5654 21 : exit_gsi = gsi_after_labels (reduc_block);
5655 : }
5656 :
5657 : /* Shouldn't be used beyond this point. */
5658 22202 : exit_bb = nullptr;
5659 :
5660 : /* If we are operating on a mask vector and do not support direct mask
5661 : reduction, work on a bool data vector instead of a mask vector. */
5662 22202 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5663 227 : && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
5664 22394 : && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
5665 : {
5666 192 : compute_vectype = vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
5667 192 : gimple_seq stmts = NULL;
5668 392 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5669 400 : reduc_inputs[i] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
5670 200 : reduc_inputs[i],
5671 : build_one_cst (vectype),
5672 : build_zero_cst (vectype));
5673 192 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5674 : }
5675 :
5676 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5677 67 : && reduc_fn != IFN_LAST)
5678 : {
5679 : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5680 : various data values where the condition matched and another vector
5681 : (INDUCTION_INDEX) containing all the indexes of those matches. We
5682 : need to extract the last matching index (which will be the index with
5683 : highest value) and use this to index into the data vector.
5684 : For the case where there were no matches, the data vector will contain
5685 : all default values and the index vector will be all zeros. */
5686 :
5687 : /* Get various versions of the type of the vector of indexes. */
5688 4 : tree index_vec_type = TREE_TYPE (induction_index);
5689 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5690 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
5691 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
5692 :
5693 : /* Get an unsigned integer version of the type of the data vector. */
5694 4 : int scalar_precision
5695 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5696 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5697 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5698 : vectype);
5699 :
5700 : /* First we need to create a vector (ZERO_VEC) of zeros and another
5701 : vector (MAX_INDEX_VEC) filled with the last matching index, which we
5702 : can create using a MAX reduction and then expanding.
5703 : In the case where the loop never made any matches, the max index will
5704 : be zero. */
5705 :
5706 : /* Vector of {0, 0, 0,...}. */
5707 4 : tree zero_vec = build_zero_cst (vectype);
5708 :
5709 : /* Find maximum value from the vector of found indexes. */
5710 4 : tree max_index = make_ssa_name (index_scalar_type);
5711 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5712 : 1, induction_index);
5713 4 : gimple_call_set_lhs (max_index_stmt, max_index);
5714 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5715 :
5716 : /* Vector of {max_index, max_index, max_index,...}. */
5717 4 : tree max_index_vec = make_ssa_name (index_vec_type);
5718 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5719 : max_index);
5720 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5721 : max_index_vec_rhs);
5722 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5723 :
5724 : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5725 : with the vector (INDUCTION_INDEX) of found indexes, choosing values
5726 : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5727 : otherwise. Only one value should match, resulting in a vector
5728 : (VEC_COND) with one data value and the rest zeros.
5729 : In the case where the loop never made any matches, every index will
5730 : match, resulting in a vector with all data values (which will all be
5731 : the default value). */
5732 :
5733 : /* Compare the max index vector to the vector of found indexes to find
5734 : the position of the max value. */
5735 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
5736 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5737 : induction_index,
5738 : max_index_vec);
5739 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5740 :
5741 : /* Use the compare to choose either values from the data vector or
5742 : zero. */
5743 4 : tree vec_cond = make_ssa_name (vectype);
5744 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5745 : vec_compare,
5746 4 : reduc_inputs[0],
5747 : zero_vec);
5748 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5749 :
5750 : /* Finally we need to extract the data value from the vector (VEC_COND)
5751 : into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5752 : reduction, but because this doesn't exist, we can use a MAX reduction
5753 : instead. The data value might be signed or a float so we need to cast
5754 : it first.
5755 : In the case where the loop never made any matches, the data values are
5756 : all identical, and so will reduce down correctly. */
5757 :
5758 : /* Make the matched data values unsigned. */
5759 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5760 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5761 : vec_cond);
5762 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5763 : VIEW_CONVERT_EXPR,
5764 : vec_cond_cast_rhs);
5765 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5766 :
5767 : /* Reduce down to a scalar value. */
5768 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
5769 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5770 : 1, vec_cond_cast);
5771 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5772 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5773 :
5774 : /* Convert the reduced value back to the result type and set as the
5775 : result. */
5776 4 : gimple_seq stmts = NULL;
5777 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5778 : data_reduc);
5779 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5780 4 : scalar_results.safe_push (new_temp);
5781 4 : }
5782 22198 : else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5783 63 : && reduc_fn == IFN_LAST)
5784 : {
5785 : /* Condition reduction without supported IFN_REDUC_MAX. Generate
5786 : idx = 0;
5787 : idx_val = induction_index[0];
5788 : val = data_reduc[0];
5789 : for (idx = 0, val = init, i = 0; i < nelts; ++i)
5790 : if (induction_index[i] > idx_val)
5791 : val = data_reduc[i], idx_val = induction_index[i];
5792 : return val; */
5793 :
5794 63 : tree data_eltype = TREE_TYPE (vectype);
5795 63 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5796 63 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5797 63 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5798 : /* Enforced by vectorizable_reduction, which ensures we have target
5799 : support before allowing a conditional reduction on variable-length
5800 : vectors. */
5801 63 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5802 63 : tree idx_val = NULL_TREE, val = NULL_TREE;
5803 419 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5804 : {
5805 356 : tree old_idx_val = idx_val;
5806 356 : tree old_val = val;
5807 356 : idx_val = make_ssa_name (idx_eltype);
5808 356 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5809 : build3 (BIT_FIELD_REF, idx_eltype,
5810 : induction_index,
5811 356 : bitsize_int (el_size),
5812 356 : bitsize_int (off)));
5813 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5814 356 : val = make_ssa_name (data_eltype);
5815 712 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5816 : build3 (BIT_FIELD_REF,
5817 : data_eltype,
5818 356 : reduc_inputs[0],
5819 356 : bitsize_int (el_size),
5820 356 : bitsize_int (off)));
5821 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5822 356 : if (off != 0)
5823 : {
5824 293 : tree new_idx_val = idx_val;
5825 293 : if (off != v_size - el_size)
5826 : {
5827 230 : new_idx_val = make_ssa_name (idx_eltype);
5828 230 : epilog_stmt = gimple_build_assign (new_idx_val,
5829 : MAX_EXPR, idx_val,
5830 : old_idx_val);
5831 230 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5832 : }
5833 293 : tree cond = make_ssa_name (boolean_type_node);
5834 293 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5835 : idx_val, old_idx_val);
5836 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5837 293 : tree new_val = make_ssa_name (data_eltype);
5838 293 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5839 : cond, val, old_val);
5840 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5841 293 : idx_val = new_idx_val;
5842 293 : val = new_val;
5843 : }
5844 : }
5845 : /* Convert the reduced value back to the result type and set as the
5846 : result. */
5847 63 : gimple_seq stmts = NULL;
5848 63 : val = gimple_convert (&stmts, scalar_type, val);
5849 63 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5850 63 : scalar_results.safe_push (val);
5851 63 : }
5852 :
5853 : /* 2.3 Create the reduction code, using one of the three schemes described
5854 : above. In SLP we simply need to extract all the elements from the
5855 : vector (without reducing them), so we use scalar shifts. */
5856 22135 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
5857 : {
5858 20263 : tree tmp;
5859 20263 : tree vec_elem_type;
5860 :
5861 : /* Case 1: Create:
5862 : v_out2 = reduc_expr <v_out1> */
5863 :
5864 20263 : if (dump_enabled_p ())
5865 1514 : dump_printf_loc (MSG_NOTE, vect_location,
5866 : "Reduce using direct vector reduction.\n");
5867 :
5868 20263 : gimple_seq stmts = NULL;
5869 20263 : vec_elem_type = TREE_TYPE (vectype);
5870 20263 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5871 20263 : vec_elem_type, reduc_inputs[0]);
5872 20263 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5873 20263 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5874 :
5875 20263 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5876 62 : && induc_val)
5877 : {
5878 : /* Earlier we set the initial value to be a vector of induc_val
5879 : values. Check the result and if it is induc_val then replace
5880 : with the original initial value, unless induc_val is
5881 : the same as initial_def already. */
5882 60 : tree zcompare = make_ssa_name (boolean_type_node);
5883 60 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5884 : new_temp, induc_val);
5885 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5886 60 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
5887 60 : tmp = make_ssa_name (new_scalar_dest);
5888 60 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5889 : initial_def, new_temp);
5890 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5891 60 : new_temp = tmp;
5892 : }
5893 :
5894 20263 : scalar_results.safe_push (new_temp);
5895 20263 : }
5896 1681 : else if (direct_slp_reduc)
5897 : {
5898 : /* Here we create one vector for each of the GROUP_SIZE results,
5899 : with the elements for other SLP statements replaced with the
5900 : neutral value. We can then do a normal reduction on each vector. */
5901 :
5902 : /* Enforced by vectorizable_reduction. */
5903 : gcc_assert (reduc_inputs.length () == 1);
5904 : gcc_assert (pow2p_hwi (group_size));
5905 :
5906 : gimple_seq seq = NULL;
5907 :
5908 : /* Build a vector {0, 1, 2, ...}, with the same number of elements
5909 : and the same element size as VECTYPE. */
5910 : tree index = build_index_vector (vectype, 0, 1);
5911 : tree index_type = TREE_TYPE (index);
5912 : tree index_elt_type = TREE_TYPE (index_type);
5913 : tree mask_type = truth_type_for (index_type);
5914 :
5915 : /* Create a vector that, for each element, identifies which of
5916 : the results should use it. */
5917 : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5918 : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5919 : build_vector_from_val (index_type, index_mask));
5920 :
5921 : /* Get a neutral vector value. This is simply a splat of the neutral
5922 : scalar value if we have one, otherwise the initial scalar value
5923 : is itself a neutral value. */
5924 : tree vector_identity = NULL_TREE;
5925 : tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5926 : NULL_TREE, false);
5927 : if (neutral_op)
5928 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5929 : neutral_op);
5930 : for (unsigned int i = 0; i < group_size; ++i)
5931 : {
5932 : /* If there's no universal neutral value, we can use the
5933 : initial scalar value from the original PHI. This is used
5934 : for MIN and MAX reduction, for example. */
5935 : if (!neutral_op)
5936 : {
5937 : tree scalar_value
5938 : = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
5939 : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5940 : scalar_value);
5941 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5942 : scalar_value);
5943 : }
5944 :
5945 : /* Calculate the equivalent of:
5946 :
5947 : sel[j] = (index[j] == i);
5948 :
5949 : which selects the elements of REDUC_INPUTS[0] that should
5950 : be included in the result. */
5951 : tree compare_val = build_int_cst (index_elt_type, i);
5952 : compare_val = build_vector_from_val (index_type, compare_val);
5953 : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5954 : index, compare_val);
5955 :
5956 : /* Calculate the equivalent of:
5957 :
5958 : vec = sel ? reduc_inputs[0] : vector_identity;
5959 :
5960 : VEC is now suitable for a full vector reduction. */
5961 : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5962 : sel, reduc_inputs[0], vector_identity);
5963 :
5964 : /* Do the reduction and convert it to the appropriate type. */
5965 : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5966 : TREE_TYPE (vectype), vec);
5967 : scalar = gimple_convert (&seq, scalar_type, scalar);
5968 : scalar_results.safe_push (scalar);
5969 : }
5970 : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5971 : }
5972 : else
5973 : {
5974 1681 : bool reduce_with_shift;
5975 1681 : tree vec_temp;
5976 :
5977 1681 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5978 :
5979 : /* See if the target wants to do the final (shift) reduction
5980 : in a vector mode of smaller size and first reduce upper/lower
5981 : halves against each other. */
5982 1872 : enum machine_mode mode1 = mode;
5983 1872 : tree stype = TREE_TYPE (vectype);
5984 1872 : if (compute_vectype != vectype)
5985 : {
5986 482 : stype = unsigned_type_for (stype);
5987 482 : gimple_seq stmts = NULL;
5988 1034 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5989 : {
5990 552 : tree new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5991 552 : compute_vectype, reduc_inputs[i]);
5992 552 : reduc_inputs[i] = new_temp;
5993 : }
5994 482 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5995 : }
5996 1872 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5997 1872 : unsigned nunits1 = nunits;
5998 1872 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5999 1872 : && reduc_inputs.length () == 1)
6000 : {
6001 41 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6002 : /* For SLP reductions we have to make sure lanes match up, but
6003 : since we're doing individual element final reduction reducing
6004 : vector width here is even more important.
6005 : ??? We can also separate lanes with permutes, for the common
6006 : case of power-of-two group-size odd/even extracts would work. */
6007 41 : if (slp_reduc && nunits != nunits1)
6008 : {
6009 41 : nunits1 = least_common_multiple (nunits1, group_size);
6010 82 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6011 : }
6012 : }
6013 1831 : else if (!slp_reduc
6014 1831 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6015 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6016 :
6017 1872 : tree vectype1 = compute_vectype;
6018 1872 : if (mode1 != mode)
6019 : {
6020 47 : vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6021 47 : stype, nunits1);
6022 : /* First reduce the vector to the desired vector size we should
6023 : do shift reduction on by combining upper and lower halves. */
6024 47 : gimple_seq stmts = NULL;
6025 47 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6026 : code, &stmts);
6027 47 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6028 47 : reduc_inputs[0] = new_temp;
6029 : }
6030 :
6031 1872 : reduce_with_shift = have_whole_vector_shift (mode1);
6032 729 : if (!VECTOR_MODE_P (mode1)
6033 2599 : || !directly_supported_p (code, vectype1))
6034 : reduce_with_shift = false;
6035 :
6036 1855 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
6037 : {
6038 1631 : int element_bitsize = vector_element_bits (vectype1);
6039 : /* Enforced by vectorizable_reduction, which disallows SLP reductions
6040 : for variable-length vectors and also requires direct target support
6041 : for loop reductions. */
6042 1631 : int nelements = TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6043 1631 : vec_perm_builder sel;
6044 1631 : vec_perm_indices indices;
6045 :
6046 1631 : int elt_offset;
6047 :
6048 1631 : tree zero_vec = build_zero_cst (vectype1);
6049 : /* Case 2: Create:
6050 : for (offset = nelements/2; offset >= 1; offset/=2)
6051 : {
6052 : Create: va' = vec_shift <va, offset>
6053 : Create: va = vop <va, va'>
6054 : } */
6055 :
6056 1631 : if (dump_enabled_p ())
6057 365 : dump_printf_loc (MSG_NOTE, vect_location,
6058 : "Reduce using vector shifts\n");
6059 :
6060 1631 : gimple_seq stmts = NULL;
6061 1631 : new_temp = gimple_convert (&stmts, vectype1, reduc_inputs[0]);
6062 1631 : for (elt_offset = nelements / 2;
6063 3572 : elt_offset >= 1;
6064 1941 : elt_offset /= 2)
6065 : {
6066 1941 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6067 1941 : indices.new_vector (sel, 2, nelements);
6068 1941 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6069 1941 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6070 : new_temp, zero_vec, mask);
6071 1941 : new_temp = gimple_build (&stmts, code,
6072 : vectype1, new_name, new_temp);
6073 : }
6074 :
6075 : /* 2.4 Extract the final scalar result. Create:
6076 : s_out3 = extract_field <v_out2, bitpos> */
6077 :
6078 1631 : if (dump_enabled_p ())
6079 365 : dump_printf_loc (MSG_NOTE, vect_location,
6080 : "extract scalar result\n");
6081 :
6082 1631 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
6083 1631 : new_temp, bitsize_int (element_bitsize),
6084 1631 : bitsize_zero_node);
6085 1631 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6086 1631 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6087 1631 : scalar_results.safe_push (new_temp);
6088 1631 : }
6089 : else
6090 : {
6091 : /* Case 3: Create:
6092 : s = extract_field <v_out2, 0>
6093 : for (offset = element_size;
6094 : offset < vector_size;
6095 : offset += element_size;)
6096 : {
6097 : Create: s' = extract_field <v_out2, offset>
6098 : Create: s = op <s, s'> // For non SLP cases
6099 : } */
6100 :
6101 241 : if (dump_enabled_p ())
6102 150 : dump_printf_loc (MSG_NOTE, vect_location,
6103 : "Reduce using scalar code.\n");
6104 :
6105 241 : tree compute_type = TREE_TYPE (vectype1);
6106 241 : unsigned element_bitsize = vector_element_bits (vectype1);
6107 241 : unsigned vec_size_in_bits = element_bitsize
6108 241 : * TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6109 241 : tree bitsize = bitsize_int (element_bitsize);
6110 241 : gimple_seq stmts = NULL;
6111 647 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6112 : {
6113 406 : unsigned bit_offset;
6114 812 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6115 406 : vec_temp, bitsize, bitsize_zero_node);
6116 :
6117 : /* In SLP we don't need to apply reduction operation, so we just
6118 : collect s' values in SCALAR_RESULTS. */
6119 406 : if (slp_reduc)
6120 396 : scalar_results.safe_push (new_temp);
6121 :
6122 1000 : for (bit_offset = element_bitsize;
6123 1406 : bit_offset < vec_size_in_bits;
6124 1000 : bit_offset += element_bitsize)
6125 : {
6126 1000 : tree bitpos = bitsize_int (bit_offset);
6127 1000 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6128 : compute_type, vec_temp,
6129 : bitsize, bitpos);
6130 1000 : if (slp_reduc)
6131 : {
6132 : /* In SLP we don't need to apply reduction operation, so
6133 : we just collect s' values in SCALAR_RESULTS. */
6134 990 : new_temp = new_name;
6135 990 : scalar_results.safe_push (new_name);
6136 : }
6137 : else
6138 10 : new_temp = gimple_build (&stmts, code, compute_type,
6139 : new_name, new_temp);
6140 : }
6141 : }
6142 :
6143 : /* The only case where we need to reduce scalar results in a SLP
6144 : reduction, is unrolling. If the size of SCALAR_RESULTS is
6145 : greater than GROUP_SIZE, we reduce them combining elements modulo
6146 : GROUP_SIZE. */
6147 241 : if (slp_reduc)
6148 : {
6149 231 : tree res, first_res, new_res;
6150 :
6151 : /* Reduce multiple scalar results in case of SLP unrolling. */
6152 925 : for (j = group_size; scalar_results.iterate (j, &res);
6153 : j++)
6154 : {
6155 694 : first_res = scalar_results[j % group_size];
6156 694 : new_res = gimple_build (&stmts, code, compute_type,
6157 : first_res, res);
6158 694 : scalar_results[j % group_size] = new_res;
6159 : }
6160 231 : scalar_results.truncate (group_size);
6161 1154 : for (k = 0; k < group_size; k++)
6162 1384 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6163 692 : scalar_results[k]);
6164 : }
6165 : else
6166 : {
6167 : /* Reduction chain - we have one scalar to keep in
6168 : SCALAR_RESULTS. */
6169 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6170 10 : scalar_results.safe_push (new_temp);
6171 : }
6172 :
6173 241 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6174 : }
6175 :
6176 1872 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6177 0 : && induc_val)
6178 : {
6179 : /* Earlier we set the initial value to be a vector of induc_val
6180 : values. Check the result and if it is induc_val then replace
6181 : with the original initial value, unless induc_val is
6182 : the same as initial_def already. */
6183 0 : tree zcompare = make_ssa_name (boolean_type_node);
6184 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6185 0 : scalar_results[0], induc_val);
6186 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6187 0 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
6188 0 : tree tmp = make_ssa_name (new_scalar_dest);
6189 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6190 0 : initial_def, scalar_results[0]);
6191 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6192 0 : scalar_results[0] = tmp;
6193 : }
6194 : }
6195 :
6196 : /* 2.5 Adjust the final result by the initial value of the reduction
6197 : variable. (When such adjustment is not needed, then
6198 : 'adjustment_def' is zero). For example, if code is PLUS we create:
6199 : new_temp = loop_exit_def + adjustment_def */
6200 :
6201 22202 : if (adjustment_def)
6202 : {
6203 15910 : gcc_assert (!slp_reduc || group_size == 1);
6204 15910 : gimple_seq stmts = NULL;
6205 15910 : if (double_reduc)
6206 : {
6207 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6208 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6209 0 : new_temp = gimple_build (&stmts, code, vectype,
6210 0 : reduc_inputs[0], adjustment_def);
6211 : }
6212 : else
6213 : {
6214 15910 : new_temp = scalar_results[0];
6215 15910 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6216 15910 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6217 : adjustment_def);
6218 15910 : new_temp = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6219 : new_temp);
6220 15910 : new_temp = gimple_build (&stmts, code, TREE_TYPE (compute_vectype),
6221 : new_temp, adjustment_def);
6222 15910 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6223 : }
6224 :
6225 15910 : epilog_stmt = gimple_seq_last_stmt (stmts);
6226 15910 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6227 15910 : scalar_results[0] = new_temp;
6228 : }
6229 :
6230 : /* Record this operation if it could be reused by the epilogue loop. */
6231 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
6232 22202 : && reduc_inputs.length () == 1)
6233 22016 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6234 : { orig_reduc_input, reduc_info });
6235 :
6236 : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6237 : phis with new adjusted scalar results, i.e., replace use <s_out0>
6238 : with use <s_out4>.
6239 :
6240 : Transform:
6241 : loop_exit:
6242 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6243 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6244 : v_out2 = reduce <v_out1>
6245 : s_out3 = extract_field <v_out2, 0>
6246 : s_out4 = adjust_result <s_out3>
6247 : use <s_out0>
6248 : use <s_out0>
6249 :
6250 : into:
6251 :
6252 : loop_exit:
6253 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6254 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6255 : v_out2 = reduce <v_out1>
6256 : s_out3 = extract_field <v_out2, 0>
6257 : s_out4 = adjust_result <s_out3>
6258 : use <s_out4>
6259 : use <s_out4> */
6260 :
6261 44404 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6262 22202 : auto_vec<gimple *> phis;
6263 44865 : for (k = 0; k < live_out_stmts.size (); k++)
6264 : {
6265 22663 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6266 22663 : tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6267 :
6268 : /* Find the loop-closed-use at the loop exit of the original scalar
6269 : result. (The reduction result is expected to have two immediate uses,
6270 : one at the latch block, and one at the loop exit). Note with
6271 : early break we can have two exit blocks, so pick the correct PHI. */
6272 115113 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6273 69787 : if (!is_gimple_debug (USE_STMT (use_p))
6274 69787 : && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6275 : {
6276 22658 : gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
6277 22658 : if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6278 22650 : phis.safe_push (USE_STMT (use_p));
6279 22663 : }
6280 :
6281 45313 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6282 : {
6283 : /* Replace the uses: */
6284 22650 : orig_name = PHI_RESULT (exit_phi);
6285 :
6286 : /* Look for a single use at the target of the skip edge. */
6287 22650 : if (unify_with_main_loop_p)
6288 : {
6289 38 : use_operand_p use_p;
6290 38 : gimple *user;
6291 38 : if (!single_imm_use (orig_name, &use_p, &user))
6292 0 : gcc_unreachable ();
6293 38 : orig_name = gimple_get_lhs (user);
6294 : }
6295 :
6296 22650 : scalar_result = scalar_results[k];
6297 84028 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6298 : {
6299 38728 : gphi *use_phi = dyn_cast <gphi *> (use_stmt);
6300 116228 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6301 : {
6302 38750 : if (use_phi
6303 38750 : && (phi_arg_edge_from_use (use_p)->flags & EDGE_ABNORMAL))
6304 : {
6305 0 : gcc_assert (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (orig_name));
6306 0 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (scalar_result) = 1;
6307 : }
6308 38750 : SET_USE (use_p, scalar_result);
6309 : }
6310 38728 : update_stmt (use_stmt);
6311 22650 : }
6312 : }
6313 :
6314 22663 : phis.truncate (0);
6315 : }
6316 22202 : }
6317 :
6318 : /* Return a vector of type VECTYPE that is equal to the vector select
6319 : operation "MASK ? VEC : IDENTITY". Insert the select statements
6320 : before GSI. */
6321 :
6322 : static tree
6323 9 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6324 : tree vec, tree identity)
6325 : {
6326 9 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6327 9 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6328 : mask, vec, identity);
6329 9 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6330 9 : return cond;
6331 : }
6332 :
6333 : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6334 : order, starting with LHS. Insert the extraction statements before GSI and
6335 : associate the new scalar SSA names with variable SCALAR_DEST.
6336 : If MASK is nonzero mask the input and then operate on it unconditionally.
6337 : Return the SSA name for the result. */
6338 :
6339 : static tree
6340 1101 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6341 : tree_code code, tree lhs, tree vector_rhs,
6342 : tree mask)
6343 : {
6344 1101 : tree vectype = TREE_TYPE (vector_rhs);
6345 1101 : tree scalar_type = TREE_TYPE (vectype);
6346 1101 : tree bitsize = TYPE_SIZE (scalar_type);
 : /* The open-coded expansion requires a constant-size vector, hence
 : tree_to_uhwi on TYPE_SIZE.  */
6347 1101 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6348 1101 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6349 :
6350 : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6351 : to perform an unconditional element-wise reduction of it. */
6352 1101 : if (mask)
6353 : {
6354 77 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6355 : "masked_vector_rhs");
 : /* Inactive lanes are replaced by the neutral element of CODE so
 : they do not affect the reduction result.  */
6356 77 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6357 : false);
6358 77 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6359 77 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6360 : mask, vector_rhs, vector_identity);
6361 77 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6362 77 : vector_rhs = masked_vector_rhs;
6363 : }
6364 :
 : /* Peel the vector apart one element at a time: extract each element
 : with a BIT_FIELD_REF and fold it into the running scalar LHS.  */
6365 1101 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6366 5141 : bit_offset < vec_size_in_bits;
6367 4040 : bit_offset += element_bitsize)
6368 : {
6369 4040 : tree bitpos = bitsize_int (bit_offset);
6370 4040 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6371 : bitsize, bitpos);
6372 :
6373 4040 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6374 4040 : rhs = make_ssa_name (scalar_dest, stmt);
6375 4040 : gimple_assign_set_lhs (stmt, rhs);
6376 4040 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6377 : /* Fold the vector extract, combining it with a previous reversal
6378 : like seen in PR90579. */
6379 4040 : auto gsi2 = gsi_for_stmt (stmt);
6380 4040 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6381 356 : update_stmt (gsi_stmt (gsi2));
6382 :
 : /* LHS = LHS CODE element.  */
6383 4040 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6384 4040 : tree new_name = make_ssa_name (scalar_dest, stmt);
6385 4040 : gimple_assign_set_lhs (stmt, new_name);
6386 4040 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6387 4040 : lhs = new_name;
6388 : }
6389 1101 : return lhs;
6390 : }
6391 :
6392 : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6393 : type of the vector input. */
6394 :
6395 : static internal_fn
6396 2537 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6397 : {
6398 2537 : internal_fn mask_reduc_fn;
6399 2537 : internal_fn mask_len_reduc_fn;
6400 :
 : /* Only the in-order FOLD_LEFT_PLUS reduction has masked
 : counterparts; everything else has no masked equivalent.  */
6401 2537 : switch (reduc_fn)
6402 : {
6403 0 : case IFN_FOLD_LEFT_PLUS:
6404 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6405 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6406 0 : break;
6407 :
6408 : default:
6409 : return IFN_LAST;
6410 : }
6411 :
 : /* Prefer the plain masked variant; fall back to the combined
 : mask+length variant if only that is supported.  */
6412 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6413 : OPTIMIZE_FOR_SPEED))
6414 : return mask_reduc_fn;
6415 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6416 : OPTIMIZE_FOR_SPEED))
6417 : return mask_len_reduc_fn;
6418 : return IFN_LAST;
6419 : }
6420 :
6421 : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6422 : statement that sets the live-out value. REDUC_DEF_STMT is the phi
6423 : statement. CODE is the operation performed by STMT_INFO and OPS are
6424 : its scalar operands. REDUC_INDEX is the index of the operand in
6425 : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6426 : implements in-order reduction, or IFN_LAST if we should open-code it.
6427 : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6428 : that should be used to control the operation in a fully-masked loop. */
6429 :
6430 : static bool
6431 843 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6432 : stmt_vec_info stmt_info,
6433 : gimple_stmt_iterator *gsi,
6434 : slp_tree slp_node,
6435 : code_helper code, internal_fn reduc_fn,
6436 : int num_ops, tree vectype_in,
6437 : int reduc_index, vec_loop_masks *masks,
6438 : vec_loop_lens *lens)
6439 : {
6440 843 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6441 843 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
6442 843 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6443 :
6444 843 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6445 :
6446 843 : bool is_cond_op = false;
6447 843 : if (!code.is_tree_code ())
6448 : {
 : /* The reduction stmt is an IFN_COND_* call; reduce with the
 : unconditional tree code it wraps and remember to apply its
 : mask operand below.  */
6449 23 : code = conditional_internal_fn_code (internal_fn (code));
6450 23 : gcc_assert (code != ERROR_MARK);
6451 : is_cond_op = true;
6452 : }
6453 :
6454 843 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
6455 :
6456 843 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6457 : TYPE_VECTOR_SUBPARTS (vectype_in)));
6458 :
6459 : /* ??? We should, when transforming the cycle PHI, record the existing
6460 : scalar def as vector def so looking up the vector def works. This
6461 : would also allow generalizing this for reduction paths of length > 1
6462 : and/or SLP reductions. */
6463 843 : slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
6464 843 : stmt_vec_info reduc_var_def = SLP_TREE_SCALAR_STMTS (reduc_node)[0];
6465 843 : tree reduc_var = gimple_get_lhs (STMT_VINFO_STMT (reduc_var_def));
6466 :
6467 : /* The operands either come from a binary operation or an IFN_COND operation.
6468 : The former is a gimple assign with binary rhs and the latter is a
6469 : gimple call with four arguments. */
6470 843 : gcc_assert (num_ops == 2 || num_ops == 4);
6471 :
6472 843 : auto_vec<tree> vec_oprnds0, vec_opmask;
 : /* Fetch the vector defs of the non-reduction input operand; for an
 : IFN_COND_OP the data operands start at child index 2 (index 0 is
 : the mask).  */
6473 843 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
6474 843 : + (1 - reduc_index)],
6475 : &vec_oprnds0);
6476 : /* For an IFN_COND_OP we also need the vector mask operand. */
6477 843 : if (is_cond_op)
6478 23 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
6479 :
6480 : /* The transform below relies on preserving the original scalar PHI
6481 : and its latch def which we replace. So work backwards from there. */
6482 843 : tree scalar_dest
6483 843 : = gimple_phi_arg_def_from_edge (as_a <gphi *> (STMT_VINFO_STMT
6484 : (reduc_var_def)),
6485 843 : loop_latch_edge (loop));
6486 843 : stmt_vec_info scalar_dest_def_info
6487 843 : = vect_stmt_to_vectorize (loop_vinfo->lookup_def (scalar_dest));
6488 843 : tree scalar_type = TREE_TYPE (scalar_dest);
6489 :
6490 843 : int vec_num = vec_oprnds0.length ();
6491 843 : tree vec_elem_type = TREE_TYPE (vectype_out);
6492 843 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6493 :
6494 843 : tree vector_identity = NULL_TREE;
6495 843 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6496 : {
6497 2 : vector_identity = build_zero_cst (vectype_out);
6498 2 : if (!HONOR_SIGNED_ZEROS (vectype_out))
6499 : ;
6500 : else
6501 : {
 : /* When signed zeros matter use -0.0 as the identity so masked
 : lanes do not flip the sign of a -0.0 result.  */
6502 2 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6503 2 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6504 : vector_identity);
6505 : }
6506 : }
6507 :
6508 843 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6509 843 : int i;
6510 843 : tree def0;
 : /* Emit one in-order reduction step per vector def; each step
 : consumes the previous step's scalar result via REDUC_VAR.  */
6511 1944 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6512 : {
6513 1101 : gimple *new_stmt;
6514 1101 : tree mask = NULL_TREE;
6515 1101 : tree len = NULL_TREE;
6516 1101 : tree bias = NULL_TREE;
6517 1101 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6518 : {
6519 9 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
6520 : vec_num, vectype_in, i);
6521 9 : if (is_cond_op)
 : /* Combine the loop mask with the IFN_COND_OP's own mask.  */
6522 9 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
6523 9 : loop_mask, vec_opmask[i], gsi);
6524 : else
6525 : mask = loop_mask;
6526 : }
6527 1092 : else if (is_cond_op)
6528 68 : mask = vec_opmask[i];
6529 1101 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6530 : {
6531 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6532 : i, 1, false);
6533 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6534 0 : bias = build_int_cst (intQI_type_node, biasval);
6535 0 : if (!is_cond_op)
6536 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
6537 : }
6538 :
6539 : /* Handle MINUS by adding the negative. */
6540 1101 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6541 : {
6542 0 : tree negated = make_ssa_name (vectype_out);
6543 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6544 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT)
6545 0 : def0 = negated;
6546 : }
6547 :
 : /* Without a masked reduction IFN, blend the input with the
 : identity so masked-off lanes do not contribute.  */
6548 9 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
6549 1110 : && mask && mask_reduc_fn == IFN_LAST)
6550 9 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6551 : vector_identity);
6552 :
6553 : /* On the first iteration the input is simply the scalar phi
6554 : result, and for subsequent iterations it is the output of
6555 : the preceding operation. */
6556 1101 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6557 : {
6558 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6559 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6560 : def0, mask, len, bias);
6561 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6562 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6563 : def0, mask);
6564 : else
6565 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6566 : def0);
6567 : /* For chained SLP reductions the output of the previous reduction
6568 : operation serves as the input of the next. For the final statement
6569 : the output cannot be a temporary - we reuse the original
6570 : scalar destination of the last statement. */
6571 0 : if (i != vec_num - 1)
6572 : {
6573 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
6574 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6575 0 : gimple_set_lhs (new_stmt, reduc_var);
6576 : }
6577 : }
6578 : else
6579 : {
 : /* No direct support: open-code the fold-left as a chain of
 : scalar extracts and operations.  */
6580 1101 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
6581 : tree_code (code), reduc_var, def0,
6582 : mask);
6583 1101 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6584 : /* Remove the statement, so that we can use the same code paths
6585 : as for statements that we've just created. */
6586 1101 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6587 1101 : gsi_remove (&tmp_gsi, true);
6588 : }
6589 :
 : /* The final step defines the original scalar latch value;
 : replace the scalar stmt with it.  */
6590 1101 : if (i == vec_num - 1)
6591 : {
6592 843 : gimple_set_lhs (new_stmt, scalar_dest);
6593 843 : vect_finish_replace_stmt (loop_vinfo,
6594 : scalar_dest_def_info,
6595 : new_stmt);
6596 : }
6597 : else
6598 258 : vect_finish_stmt_generation (loop_vinfo,
6599 : scalar_dest_def_info,
6600 : new_stmt, gsi);
6601 :
6602 1101 : slp_node->push_vec_def (new_stmt);
6603 : }
6604 :
6605 843 : return true;
6606 843 : }
6607 :
6608 : /* Function is_nonwrapping_integer_induction.
6609 :
6610 : Check if STMT_VINFO (which is part of loop LOOP) is an increasing
6611 : induction that does not cause overflow. */
6612 :
6613 : static bool
6614 408 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6615 : {
6616 408 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6617 408 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6618 408 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6619 408 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6620 408 : widest_int ni, max_loop_value, lhs_max;
6621 408 : wi::overflow_type overflow = wi::OVF_NONE;
6622 :
6623 : /* Make sure the loop is integer based. */
6624 408 : if (TREE_CODE (base) != INTEGER_CST
6625 109 : || TREE_CODE (step) != INTEGER_CST)
6626 : return false;
6627 :
6628 : /* Check that the max size of the loop will not wrap. */
6629 :
 : /* With undefined signed overflow the induction cannot legally
 : wrap, so nothing further to check.  */
6630 109 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6631 : return true;
6632 :
 : /* Need a bound on the iteration count to reason about the maximum
 : value the IV can reach.  */
6633 8 : if (! max_stmt_executions (loop, &ni))
6634 : return false;
6635 :
 : /* Compute BASE + STEP * NI in infinite precision, giving up if any
 : intermediate computation overflows.  */
6636 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6637 8 : &overflow);
6638 8 : if (overflow)
6639 : return false;
6640 :
6641 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6642 16 : TYPE_SIGN (lhs_type), &overflow);
6643 8 : if (overflow)
6644 : return false;
6645 :
 : /* Non-wrapping iff the largest value reached still fits in the
 : precision of the IV type.  */
6646 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6647 8 : <= TYPE_PRECISION (lhs_type));
6648 408 : }
6649 :
6650 : /* Check if masking can be supported by inserting a conditional expression.
6651 : CODE is the code for the operation. COND_FN is the conditional internal
6652 : function, if it exists. VECTYPE_IN is the type of the vector input. */
static bool
6654 5104 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6655 : tree vectype_in)
6656 : {
 : /* A directly supported conditional internal function is preferable
 : to the VEC_COND_EXPR workaround, so report false in that case.  */
6657 5104 : if (cond_fn != IFN_LAST
6658 5104 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
6659 : OPTIMIZE_FOR_SPEED))
6660 : return false;
6661 :
 : /* Only DOT_PROD_EXPR and SAD_EXPR can be masked via a conditional
 : expression on their inputs (see build_vect_cond_expr).  */
6662 3598 : if (code.is_tree_code ())
6663 3592 : switch (tree_code (code))
6664 : {
6665 : case DOT_PROD_EXPR:
6666 : case SAD_EXPR:
6667 : return true;
6668 :
6669 : default:
6670 : break;
6671 : }
6672 : return false;
6673 : }
6674 :
6675 : /* Insert a conditional expression to enable masked vectorization. CODE is the
6676 : code for the operation. VOP is the array of operands. MASK is the loop
6677 : mask. GSI is a statement iterator used to place the new conditional
6678 : expression. */
6679 : static void
6680 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6681 : gimple_stmt_iterator *gsi)
6682 : {
6683 4 : switch (tree_code (code))
6684 : {
6685 4 : case DOT_PROD_EXPR:
6686 4 : {
 : /* Mask inactive lanes of one multiplication input to zero so
 : their products contribute nothing to the accumulated sum.  */
6687 4 : tree vectype = TREE_TYPE (vop[1]);
6688 4 : tree zero = build_zero_cst (vectype);
6689 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6690 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6691 : mask, vop[1], zero);
6692 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6693 4 : vop[1] = masked_op1;
6694 4 : break;
6695 : }
6696 :
6697 0 : case SAD_EXPR:
6698 0 : {
 : /* Replace inactive lanes of the second input by the first so the
 : per-lane difference is zero and adds nothing to the sum.  */
6699 0 : tree vectype = TREE_TYPE (vop[1]);
6700 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6701 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6702 : mask, vop[1], vop[0]);
6703 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6704 0 : vop[1] = masked_op1;
6705 0 : break;
6706 : }
6707 :
 : /* Callers must have checked use_mask_by_cond_expr_p first.  */
6708 0 : default:
6709 0 : gcc_unreachable ();
6710 : }
6711 4 : }
6712 :
6713 : /* Given an operation with CODE in loop reduction path whose reduction PHI is
6714 : specified by REDUC_INFO, the operation has TYPE of scalar result, and its
6715 : input vectype is represented by VECTYPE_IN. The vectype of vectorized result
6716 : may be different from VECTYPE_IN, either in base type or vectype lanes,
6717 : lane-reducing operation is the case. This function check if it is possible,
6718 : and how to perform partial vectorization on the operation in the context
6719 : of LOOP_VINFO. */
6720 :
6721 : static void
6722 3208 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
6723 : vect_reduc_info reduc_info,
6724 : slp_tree slp_node,
6725 : code_helper code, tree type,
6726 : tree vectype_in)
6727 : {
6728 3208 : enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
6729 3208 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
 : /* If CODE already is a masked internal function use it directly,
 : otherwise look up the conditional variant for CODE on TYPE.  */
6730 3208 : internal_fn cond_fn
6731 924 : = ((code.is_internal_fn ()
6732 924 : && internal_fn_mask_index ((internal_fn)code) != -1)
6733 3208 : ? (internal_fn)code : get_conditional_internal_fn (code, type));
6734 :
 : /* Out-of-order reductions need either the VEC_COND_EXPR trick or a
 : directly supported conditional operation to be maskable.  */
6735 3208 : if (reduc_type != FOLD_LEFT_REDUCTION
6736 2529 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6737 5694 : && (cond_fn == IFN_LAST
6738 2486 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6739 : OPTIMIZE_FOR_SPEED)))
6740 : {
6741 1514 : if (dump_enabled_p ())
6742 98 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6743 : "can't operate on partial vectors because"
6744 : " no conditional operation is available.\n");
6745 1514 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6746 : }
 : /* Open-coded fold-left reductions mask the input via VEC_COND_EXPR,
 : so that has to be supported for VECTYPE_IN.  */
6747 1694 : else if (reduc_type == FOLD_LEFT_REDUCTION
6748 1694 : && reduc_fn == IFN_LAST
6749 1694 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
6750 : {
6751 0 : if (dump_enabled_p ())
6752 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6753 : "can't operate on partial vectors because"
6754 : " no conditional operation is available.\n");
6755 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6756 : }
6757 1694 : else if (reduc_type == FOLD_LEFT_REDUCTION
6758 679 : && internal_fn_mask_index (reduc_fn) == -1
6759 679 : && FLOAT_TYPE_P (vectype_in)
6760 2368 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
6761 : {
6762 0 : if (dump_enabled_p ())
6763 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6764 : "can't operate on partial vectors because"
6765 : " signed zeros cannot be preserved.\n");
6766 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6767 : }
6768 : else
6769 : {
 : /* Partial vectorization is possible; record the loop mask or
 : loop length requirement for this reduction.  */
6770 1694 : internal_fn mask_reduc_fn
6771 1694 : = get_masked_reduction_fn (reduc_fn, vectype_in);
6772 1694 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6773 1694 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
6774 1694 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);
6775 :
6776 1694 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6777 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
6778 : else
6779 1694 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
6780 : }
6781 3208 : }
6782 :
6783 : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
6784 : the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
6785 : and the analysis is for slp if SLP_NODE is not NULL.
6786 :
6787 : For a lane-reducing operation, the loop reduction path that it lies in,
6788 : may contain normal operation, or other lane-reducing operation of different
6789 : input type size, an example as:
6790 :
6791 : int sum = 0;
6792 : for (i)
6793 : {
6794 : ...
6795 : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
6796 : sum += w[i]; // widen-sum <vector(16) char>
6797 : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
6798 : sum += n[i]; // normal <vector(4) int>
6799 : ...
6800 : }
6801 :
6802 : Vectorization factor is essentially determined by operation whose input
6803 : vectype has the most lanes ("vector(16) char" in the example), while we
6804 : need to choose input vectype with the least lanes ("vector(4) int" in the
6805 : example) to determine effective number of vector reduction PHIs. */
6806 :
6807 : bool
6808 331214 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
6809 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
6810 : {
6811 331214 : gimple *stmt = stmt_info->stmt;
6812 :
6813 331214 : if (!lane_reducing_stmt_p (stmt))
6814 : return false;
6815 :
6816 454 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
6817 :
6818 454 : if (!INTEGRAL_TYPE_P (type))
6819 : return false;
6820 :
6821 : /* Do not try to vectorize bit-precision reductions. */
6822 454 : if (!type_has_mode_precision_p (type))
6823 : return false;
6824 :
6825 454 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6826 :
6827 : /* TODO: Support lane-reducing operation that does not directly participate
6828 : in loop reduction. */
6829 454 : if (!reduc_info)
6830 : return false;
6831 :
6832 : /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
6833 : recoginized. */
6834 454 : gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
6835 454 : gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);
6836 :
6837 1816 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
6838 : {
6839 1362 : slp_tree slp_op;
6840 1362 : tree op;
6841 1362 : tree vectype;
6842 1362 : enum vect_def_type dt;
6843 :
6844 1362 : if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
6845 : &slp_op, &dt, &vectype))
6846 : {
6847 0 : if (dump_enabled_p ())
6848 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6849 : "use not simple.\n");
6850 0 : return false;
6851 : }
6852 :
6853 1362 : if (!vectype)
6854 : {
6855 6 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
6856 : slp_op);
6857 6 : if (!vectype)
6858 : return false;
6859 : }
6860 :
6861 1362 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
6862 : {
6863 0 : if (dump_enabled_p ())
6864 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6865 : "incompatible vector types for invariants\n");
6866 0 : return false;
6867 : }
6868 :
6869 1362 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6870 454 : continue;
6871 :
6872 : /* There should be at most one cycle def in the stmt. */
6873 908 : if (VECTORIZABLE_CYCLE_DEF (dt))
6874 : return false;
6875 : }
6876 :
6877 454 : slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
6878 454 : tree vectype_in = SLP_TREE_VECTYPE (node_in);
6879 454 : gcc_assert (vectype_in);
6880 :
6881 : /* Compute number of effective vector statements for costing. */
6882 454 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
6883 454 : gcc_assert (ncopies_for_cost >= 1);
6884 :
6885 454 : if (vect_is_emulated_mixed_dot_prod (slp_node))
6886 : {
6887 : /* We need extra two invariants: one that contains the minimum signed
6888 : value and one that contains half of its negative. */
6889 9 : int prologue_stmts = 2;
6890 9 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
6891 : scalar_to_vec, slp_node, 0,
6892 : vect_prologue);
6893 9 : if (dump_enabled_p ())
6894 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
6895 : "extra prologue_cost = %d .\n", cost);
6896 :
6897 : /* Three dot-products and a subtraction. */
6898 9 : ncopies_for_cost *= 4;
6899 : }
6900 :
6901 454 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
6902 : 0, vect_body);
6903 :
6904 454 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
6905 : {
6906 43 : enum tree_code code = gimple_assign_rhs_code (stmt);
6907 43 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
6908 43 : node_in, code, type,
6909 : vectype_in);
6910 : }
6911 :
6912 : /* Transform via vect_transform_reduction. */
6913 454 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6914 454 : return true;
6915 : }
6916 :
6917 : /* Function vectorizable_reduction.
6918 :
6919 : Check if STMT_INFO performs a reduction operation that can be vectorized.
6920 : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6921 : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6922 : Return true if STMT_INFO is vectorizable in this way.
6923 :
6924 : This function also handles reduction idioms (patterns) that have been
6925 : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6926 : may be of this form:
6927 : X = pattern_expr (arg0, arg1, ..., X)
6928 : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6929 : sequence that had been detected and replaced by the pattern-stmt
6930 : (STMT_INFO).
6931 :
6932 : This function also handles reduction of condition expressions, for example:
6933 : for (int i = 0; i < N; i++)
6934 : if (a[i] < value)
6935 : last = a[i];
6936 : This is handled by vectorising the loop and creating an additional vector
6937 : containing the loop indexes for which "a[i] < value" was true. In the
6938 : function epilogue this is reduced to a single max value and then used to
6939 : index into the vector of results.
6940 :
6941 : In some cases of reduction patterns, the type of the reduction variable X is
6942 : different than the type of the other arguments of STMT_INFO.
6943 : In such cases, the vectype that is used when transforming STMT_INFO into
6944 : a vector stmt is different than the vectype that is used to determine the
6945 : vectorization factor, because it consists of a different number of elements
6946 : than the actual number of elements that are being operated upon in parallel.
6947 :
6948 : For example, consider an accumulation of shorts into an int accumulator.
6949 : On some targets it's possible to vectorize this pattern operating on 8
6950 : shorts at a time (hence, the vectype for purposes of determining the
6951 : vectorization factor should be V8HI); on the other hand, the vectype that
6952 : is used to create the vector form is actually V4SI (the type of the result).
6953 :
6954 : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6955 : indicates what is the actual level of parallelism (V8HI in the example), so
6956 : that the right vectorization factor would be derived. This vectype
6957 : corresponds to the type of arguments to the reduction stmt, and should *NOT*
6958 : be used to create the vectorized stmt. The right vectype for the vectorized
6959 : stmt is obtained from the type of the result X:
6960 : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6961 :
6962 : This means that, contrary to "regular" reductions (or "regular" stmts in
6963 : general), the following equation:
6964 : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6965 : does *NOT* necessarily hold for reduction patterns. */
6966 :
6967 : bool
6968 330760 : vectorizable_reduction (loop_vec_info loop_vinfo,
6969 : stmt_vec_info stmt_info, slp_tree slp_node,
6970 : slp_instance slp_node_instance,
6971 : stmt_vector_for_cost *cost_vec)
6972 : {
6973 330760 : tree vectype_in = NULL_TREE;
6974 330760 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6975 330760 : stmt_vec_info cond_stmt_vinfo = NULL;
6976 330760 : int i;
6977 330760 : int ncopies;
6978 330760 : bool single_defuse_cycle = false;
6979 330760 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6980 330760 : tree cond_reduc_val = NULL_TREE;
6981 :
6982 : /* Make sure it was already recognized as a reduction computation. */
6983 330760 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6984 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6985 330760 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6986 : return false;
6987 :
6988 : /* The reduction meta. */
6989 57734 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6990 :
6991 57734 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6992 : {
6993 1427 : gcc_assert (is_a <gphi *> (stmt_info->stmt));
6994 : /* We eventually need to set a vector type on invariant arguments. */
6995 : unsigned j;
6996 : slp_tree child;
6997 4273 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6998 2854 : if (!vect_maybe_update_slp_op_vectype (child,
6999 : SLP_TREE_VECTYPE (slp_node)))
7000 : {
7001 0 : if (dump_enabled_p ())
7002 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7003 : "incompatible vector types for "
7004 : "invariants\n");
7005 0 : return false;
7006 : }
7007 2854 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7008 2854 : && !useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
7009 : SLP_TREE_VECTYPE (child)))
7010 : {
7011 : /* With bools we can have mask and non-mask precision vectors
7012 : or different non-mask precisions. While pattern recog is
7013 : supposed to guarantee consistency here, we do not have
7014 : pattern stmts for PHIs (PR123316).
7015 : Deal with that here instead of ICEing later. */
7016 8 : if (dump_enabled_p ())
7017 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7018 : "incompatible vector type setup from "
7019 : "bool pattern detection\n");
7020 8 : return false;
7021 : }
7022 : /* Analysis for double-reduction is done on the outer
7023 : loop PHI, nested cycles have no further restrictions. */
7024 1419 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7025 1419 : return true;
7026 : }
7027 :
7028 56307 : if (!is_a <gphi *> (stmt_info->stmt))
7029 : {
7030 6880 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
7031 6880 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
7032 6880 : return true;
7033 : }
7034 :
7035 49427 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7036 49427 : stmt_vec_info phi_info = stmt_info;
7037 49427 : bool double_reduc = false;
7038 49427 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7039 : {
7040 : /* We arrive here for both the inner loop LC PHI and the
7041 : outer loop PHI. The latter is what we want to analyze the
7042 : reduction with. The LC PHI is handled by vectorizable_lc_phi. */
7043 287 : if (gimple_bb (stmt_info->stmt) != loop->header)
7044 0 : return false;
7045 :
7046 : /* Set loop and phi_info to the inner loop. */
7047 287 : use_operand_p use_p;
7048 287 : gimple *use_stmt;
7049 287 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7050 : &use_p, &use_stmt);
7051 287 : gcc_assert (res);
7052 287 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
7053 287 : loop = loop->inner;
7054 287 : double_reduc = true;
7055 : }
7056 :
7057 49427 : const bool reduc_chain = reduc_info->is_reduc_chain;
7058 49427 : slp_node_instance->reduc_phis = slp_node;
7059 : /* ??? We're leaving slp_node to point to the PHIs, we only
7060 : need it to get at the number of vector stmts which wasn't
7061 : yet initialized for the instance root. */
7062 :
7063 : /* PHIs should not participate in patterns. */
7064 49427 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7065 49427 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7066 :
7067 : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7068 : and compute the reduction chain length. Discover the real
7069 : reduction operation stmt on the way (slp_for_stmt_info). */
7070 49427 : unsigned reduc_chain_length = 0;
7071 49427 : stmt_info = NULL;
7072 49427 : slp_tree slp_for_stmt_info = NULL;
7073 49427 : slp_tree vdef_slp = slp_node_instance->root;
7074 108698 : while (vdef_slp != slp_node)
7075 : {
7076 60023 : int reduc_idx = SLP_TREE_REDUC_IDX (vdef_slp);
7077 60023 : if (reduc_idx == -1)
7078 : {
7079 744 : if (dump_enabled_p ())
7080 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7081 : "reduction chain broken by patterns.\n");
7082 752 : return false;
7083 : }
7084 59279 : stmt_vec_info vdef = SLP_TREE_REPRESENTATIVE (vdef_slp);
7085 59279 : if (is_a <gphi *> (vdef->stmt))
7086 : {
7087 574 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7088 : /* Do not count PHIs towards the chain length. */
7089 574 : continue;
7090 : }
7091 58705 : gimple_match_op op;
7092 58705 : if (!gimple_extract_op (vdef->stmt, &op))
7093 : {
7094 0 : if (dump_enabled_p ())
7095 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7096 : "reduction chain includes unsupported"
7097 : " statement type.\n");
7098 0 : return false;
7099 : }
7100 58705 : if (CONVERT_EXPR_CODE_P (op.code))
7101 : {
7102 3312 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7103 : {
7104 8 : if (dump_enabled_p ())
7105 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7106 : "conversion in the reduction chain.\n");
7107 8 : return false;
7108 : }
7109 3304 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
7110 : }
7111 : else
7112 : {
7113 : /* First non-conversion stmt. */
7114 55393 : if (!slp_for_stmt_info)
7115 48675 : slp_for_stmt_info = vdef_slp;
7116 :
7117 55393 : if (lane_reducing_op_p (op.code))
7118 : {
7119 : /* The last operand of lane-reducing operation is for
7120 : reduction. */
7121 454 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7122 :
7123 454 : slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
7124 454 : tree vectype_op = SLP_TREE_VECTYPE (op_node);
7125 454 : tree type_op = TREE_TYPE (op.ops[0]);
7126 454 : if (!vectype_op)
7127 : {
7128 9 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7129 : type_op);
7130 9 : if (!vectype_op
7131 9 : || !vect_maybe_update_slp_op_vectype (op_node,
7132 : vectype_op))
7133 0 : return false;
7134 : }
7135 :
7136 : /* To accommodate lane-reducing operations of mixed input
7137 : vectypes, choose input vectype with the least lanes for the
7138 : reduction PHI statement, which would result in the most
7139 : ncopies for vectorized reduction results. */
7140 454 : if (!vectype_in
7141 454 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7142 46 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7143 431 : vectype_in = vectype_op;
7144 : }
7145 54939 : else if (!vectype_in)
7146 48244 : vectype_in = SLP_TREE_VECTYPE (slp_node);
7147 55393 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7148 : }
7149 58697 : reduc_chain_length++;
7150 : }
7151 48675 : if (!slp_for_stmt_info)
7152 : {
7153 0 : if (dump_enabled_p ())
7154 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7155 : "only noop-conversions in the reduction chain.\n");
7156 0 : return false;
7157 : }
7158 48675 : stmt_info = SLP_TREE_REPRESENTATIVE (slp_for_stmt_info);
7159 :
7160 : /* PHIs should not participate in patterns. */
7161 48675 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7162 :
7163 : /* 1. Is vectorizable reduction? */
7164 : /* Not supportable if the reduction variable is used in the loop, unless
7165 : it's a reduction chain. */
7166 48675 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7167 0 : && !reduc_chain)
7168 : return false;
7169 :
7170 : /* Reductions that are not used even in an enclosing outer-loop,
7171 : are expected to be "live" (used out of the loop). */
7172 48675 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7173 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7174 : return false;
7175 :
7176 : /* 2. Has this been recognized as a reduction pattern?
7177 :
7178 : Check if STMT represents a pattern that has been recognized
7179 : in earlier analysis stages. For stmts that represent a pattern,
7180 : the STMT_VINFO_RELATED_STMT field records the last stmt in
7181 : the original sequence that constitutes the pattern. */
7182 :
7183 48675 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7184 48675 : if (orig_stmt_info)
7185 : {
7186 3271 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7187 3271 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7188 : }
7189 :
7190 : /* 3. Check the operands of the operation. The first operands are defined
7191 : inside the loop body. The last operand is the reduction variable,
7192 : which is defined by the loop-header-phi. */
7193 :
7194 48675 : tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
7195 48675 : VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
7196 :
7197 48675 : gimple_match_op op;
7198 48675 : if (!gimple_extract_op (stmt_info->stmt, &op))
7199 0 : gcc_unreachable ();
7200 48675 : bool lane_reducing = lane_reducing_op_p (op.code);
7201 :
7202 48675 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7203 15140 : && !SCALAR_FLOAT_TYPE_P (op.type))
7204 : return false;
7205 :
7206 : /* Do not try to vectorize bit-precision reductions. */
7207 48675 : if (!type_has_mode_precision_p (op.type)
7208 1552 : && op.code != BIT_AND_EXPR
7209 1472 : && op.code != BIT_IOR_EXPR
7210 49110 : && op.code != BIT_XOR_EXPR)
7211 : return false;
7212 :
7213 : /* Lane-reducing ops also never can be used in a SLP reduction group
7214 : since we'll mix lanes belonging to different reductions. But it's
7215 : OK to use them in a reduction chain or when the reduction group
7216 : has just one element. */
7217 48365 : if (lane_reducing
7218 48365 : && !reduc_chain
7219 404 : && SLP_TREE_LANES (slp_node) > 1)
7220 : {
7221 0 : if (dump_enabled_p ())
7222 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7223 : "lane-reducing reduction in reduction group.\n");
7224 0 : return false;
7225 : }
7226 :
7227 : /* All uses but the last are expected to be defined in the loop.
7228 : The last use is the reduction variable. In case of nested cycle this
7229 : assumption is not true: we use reduc_index to record the index of the
7230 : reduction variable. */
7231 48365 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7232 48365 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7233 48365 : gcc_assert (op.code != COND_EXPR || !COMPARISON_CLASS_P (op.ops[0]));
7234 153561 : for (i = 0; i < (int) op.num_ops; i++)
7235 : {
7236 : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7237 105196 : if (i == 0 && op.code == COND_EXPR)
7238 52786 : continue;
7239 :
7240 104389 : stmt_vec_info def_stmt_info;
7241 104389 : enum vect_def_type dt;
7242 104389 : if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
7243 : i, &op.ops[i], &slp_op[i], &dt,
7244 104389 : &vectype_op[i], &def_stmt_info))
7245 : {
7246 0 : if (dump_enabled_p ())
7247 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7248 : "use not simple.\n");
7249 0 : return false;
7250 : }
7251 :
7252 : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7253 : reduction operand twice (once as definition, once as else). */
7254 104389 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]
7255 208778 : == SLP_TREE_CHILDREN
7256 104389 : (slp_for_stmt_info)[SLP_TREE_REDUC_IDX (slp_for_stmt_info)])
7257 51979 : continue;
7258 :
7259 : /* There should be only one cycle def in the stmt, the one
7260 : leading to reduc_def. */
7261 52410 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]->cycle_info.id != -1)
7262 : return false;
7263 :
7264 52410 : if (!vectype_op[i])
7265 4506 : vectype_op[i]
7266 4506 : = get_vectype_for_scalar_type (loop_vinfo,
7267 4506 : TREE_TYPE (op.ops[i]), slp_op[i]);
7268 :
7269 : /* Record how the non-reduction-def value of COND_EXPR is defined.
7270 : ??? For a chain of multiple CONDs we'd have to match them up all. */
7271 52410 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7272 : {
7273 784 : if (dt == vect_constant_def)
7274 : {
7275 95 : cond_reduc_dt = dt;
7276 95 : cond_reduc_val = op.ops[i];
7277 : }
7278 689 : else if (dt == vect_induction_def
7279 408 : && def_stmt_info
7280 1097 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7281 : {
7282 109 : cond_reduc_dt = dt;
7283 109 : cond_stmt_vinfo = def_stmt_info;
7284 : }
7285 : }
7286 : }
7287 :
7288 48365 : enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7289 : /* If we have a condition reduction, see if we can simplify it further. */
7290 48365 : if (reduction_type == COND_REDUCTION)
7291 : {
7292 795 : if (SLP_TREE_LANES (slp_node) != 1)
7293 : return false;
7294 :
7295 : /* When the condition uses the reduction value in the condition, fail. */
7296 771 : if (SLP_TREE_REDUC_IDX (slp_node) == 0)
7297 : {
7298 0 : if (dump_enabled_p ())
7299 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7300 : "condition depends on previous iteration\n");
7301 0 : return false;
7302 : }
7303 :
7304 771 : if (reduc_chain_length == 1
7305 771 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7306 : OPTIMIZE_FOR_SPEED)
7307 748 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7308 : vectype_in,
7309 : OPTIMIZE_FOR_SPEED)))
7310 : {
7311 0 : if (dump_enabled_p ())
7312 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7313 : "optimizing condition reduction with"
7314 : " FOLD_EXTRACT_LAST.\n");
7315 0 : VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7316 : }
7317 771 : else if (cond_reduc_dt == vect_induction_def)
7318 : {
7319 109 : tree base
7320 : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7321 109 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7322 :
7323 109 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7324 : && TREE_CODE (step) == INTEGER_CST);
7325 109 : cond_reduc_val = NULL_TREE;
7326 109 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7327 109 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7328 109 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7329 : ;
7330 : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7331 : above base; punt if base is the minimum value of the type for
7332 : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7333 97 : else if (tree_int_cst_sgn (step) == -1)
7334 : {
7335 18 : cond_reduc_op_code = MIN_EXPR;
7336 18 : if (tree_int_cst_sgn (base) == -1)
7337 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7338 18 : else if (tree_int_cst_lt (base,
7339 18 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7340 18 : cond_reduc_val
7341 18 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7342 : }
7343 : else
7344 : {
7345 79 : cond_reduc_op_code = MAX_EXPR;
7346 79 : if (tree_int_cst_sgn (base) == 1)
7347 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7348 79 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7349 : base))
7350 79 : cond_reduc_val
7351 79 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7352 : }
7353 97 : if (cond_reduc_val)
7354 : {
7355 97 : if (dump_enabled_p ())
7356 61 : dump_printf_loc (MSG_NOTE, vect_location,
7357 : "condition expression based on "
7358 : "integer induction.\n");
7359 97 : VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
7360 97 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
7361 97 : = cond_reduc_val;
7362 97 : VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7363 : }
7364 : }
7365 662 : else if (cond_reduc_dt == vect_constant_def)
7366 : {
7367 85 : enum vect_def_type cond_initial_dt;
7368 85 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7369 85 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7370 85 : if (cond_initial_dt == vect_constant_def
7371 107 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7372 22 : TREE_TYPE (cond_reduc_val)))
7373 : {
7374 22 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7375 : cond_initial_val, cond_reduc_val);
7376 22 : if (e && (integer_onep (e) || integer_zerop (e)))
7377 : {
7378 22 : if (dump_enabled_p ())
7379 16 : dump_printf_loc (MSG_NOTE, vect_location,
7380 : "condition expression based on "
7381 : "compile time constant.\n");
7382 : /* Record reduction code at analysis stage. */
7383 22 : VECT_REDUC_INFO_CODE (reduc_info)
7384 22 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7385 22 : VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
7386 : }
7387 : }
7388 : }
7389 : }
7390 :
7391 48341 : if (STMT_VINFO_LIVE_P (phi_info))
7392 : return false;
7393 :
7394 48341 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
7395 :
7396 48341 : gcc_assert (ncopies >= 1);
7397 :
7398 48341 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7399 :
7400 : /* 4.2. Check support for the epilog operation.
7401 :
7402 : If STMT represents a reduction pattern, then the type of the
7403 : reduction variable may be different than the type of the rest
7404 : of the arguments. For example, consider the case of accumulation
7405 : of shorts into an int accumulator; The original code:
7406 : S1: int_a = (int) short_a;
7407 : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7408 :
7409 : was replaced with:
7410 : STMT: int_acc = widen_sum <short_a, int_acc>
7411 :
7412 : This means that:
7413 : 1. The tree-code that is used to create the vector operation in the
7414 : epilog code (that reduces the partial results) is not the
7415 : tree-code of STMT, but is rather the tree-code of the original
7416 : stmt from the pattern that STMT is replacing. I.e, in the example
7417 : above we want to use 'widen_sum' in the loop, but 'plus' in the
7418 : epilog.
7419 : 2. The type (mode) we use to check available target support
7420 : for the vector operation to be created in the *epilog*, is
7421 : determined by the type of the reduction variable (in the example
7422 : above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7423 : However the type (mode) we use to check available target support
7424 : for the vector operation to be created *inside the loop*, is
7425 : determined by the type of the other arguments to STMT (in the
7426 : example we'd check this: optab_handler (widen_sum_optab,
7427 : vect_short_mode)).
7428 :
7429 : This is contrary to "regular" reductions, in which the types of all
7430 : the arguments are the same as the type of the reduction variable.
7431 : For "regular" reductions we can therefore use the same vector type
7432 : (and also the same tree-code) when generating the epilog code and
7433 : when generating the code inside the loop. */
7434 :
7435 48341 : code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
7436 :
7437 : /* If conversion might have created a conditional operation like
7438 : IFN_COND_ADD already. Use the internal code for the following checks. */
7439 48341 : if (orig_code.is_internal_fn ())
7440 : {
7441 3682 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7442 3682 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7443 : }
7444 :
7445 48341 : VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
7446 :
7447 48341 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7448 48341 : if (reduction_type == TREE_CODE_REDUCTION)
7449 : {
7450 : /* Check whether it's ok to change the order of the computation.
7451 : Generally, when vectorizing a reduction we change the order of the
7452 : computation. This may change the behavior of the program in some
7453 : cases, so we need to check that this is ok. One exception is when
7454 : vectorizing an outer-loop: the inner-loop is executed sequentially,
7455 : and therefore vectorizing reductions in the inner-loop during
7456 : outer-loop vectorization is safe. Likewise when we are vectorizing
7457 : a series of reductions using SLP and the VF is one the reductions
7458 : are performed in scalar order. */
7459 47570 : if (!reduc_chain
7460 47570 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7461 : ;
7462 47429 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7463 : {
7464 : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7465 : is not directy used in stmt. */
7466 4799 : if (reduc_chain_length != 1)
7467 : {
7468 67 : if (dump_enabled_p ())
7469 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7470 : "in-order reduction chain without SLP.\n");
7471 67 : return false;
7472 : }
7473 : /* Code generation doesn't support function calls other
7474 : than .COND_*. */
7475 4732 : if (!op.code.is_tree_code ()
7476 4866 : && !(op.code.is_internal_fn ()
7477 67 : && conditional_internal_fn_code (internal_fn (op.code))
7478 : != ERROR_MARK))
7479 : {
7480 18 : if (dump_enabled_p ())
7481 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7482 : "in-order reduction chain operation not "
7483 : "supported.\n");
7484 18 : return false;
7485 : }
7486 4714 : VECT_REDUC_INFO_TYPE (reduc_info)
7487 4714 : = reduction_type = FOLD_LEFT_REDUCTION;
7488 : }
7489 42630 : else if (!commutative_binary_op_p (orig_code, op.type)
7490 42630 : || !associative_binary_op_p (orig_code, op.type))
7491 : {
7492 152 : if (dump_enabled_p ())
7493 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7494 : "reduction: not commutative/associative\n");
7495 152 : return false;
7496 : }
7497 : }
7498 :
7499 4714 : if ((reduction_type == COND_REDUCTION
7500 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7501 : || reduction_type == CONST_COND_REDUCTION
7502 43390 : || reduction_type == EXTRACT_LAST_REDUCTION)
7503 : && 1
7504 771 : && ncopies > 1)
7505 : {
7506 276 : if (dump_enabled_p ())
7507 60 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7508 : "multiple types in condition reduction.\n");
7509 276 : return false;
7510 : }
7511 :
7512 : /* See if we can convert a mask vector to a corresponding bool data vector
7513 : to perform the epilogue reduction. */
7514 47828 : tree alt_vectype_out = NULL_TREE;
7515 47828 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
7516 : {
7517 968 : alt_vectype_out
7518 1936 : = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
7519 968 : TREE_TYPE (vectype_out),
7520 : TYPE_VECTOR_SUBPARTS
7521 : (vectype_out));
7522 968 : if (!alt_vectype_out
7523 968 : || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
7524 1917 : TYPE_VECTOR_SUBPARTS (vectype_out))
7525 1936 : || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
7526 19 : alt_vectype_out = NULL_TREE;
7527 : }
7528 :
7529 47828 : internal_fn reduc_fn = IFN_LAST;
7530 47828 : if (reduction_type == TREE_CODE_REDUCTION
7531 47828 : || reduction_type == FOLD_LEFT_REDUCTION
7532 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7533 495 : || reduction_type == CONST_COND_REDUCTION)
7534 : {
7535 42730 : if (reduction_type == FOLD_LEFT_REDUCTION
7536 51371 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7537 42730 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7538 : {
7539 46768 : internal_fn sbool_fn = IFN_LAST;
7540 46768 : if (reduc_fn == IFN_LAST)
7541 : ;
7542 44848 : else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
7543 968 : || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
7544 : == MODE_VECTOR_BOOL))
7545 88728 : && direct_internal_fn_supported_p (reduc_fn, vectype_out,
7546 : OPTIMIZE_FOR_SPEED))
7547 : ;
7548 10205 : else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
7549 968 : && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
7550 11173 : && direct_internal_fn_supported_p (sbool_fn, vectype_out,
7551 : OPTIMIZE_FOR_SPEED))
7552 73 : reduc_fn = sbool_fn;
7553 10132 : else if (reduction_type != FOLD_LEFT_REDUCTION
7554 10132 : && alt_vectype_out
7555 10132 : && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
7556 : OPTIMIZE_FOR_SPEED))
7557 724 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7558 : else
7559 : {
7560 9408 : if (dump_enabled_p ())
7561 846 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7562 : "reduc op not supported by target.\n");
7563 :
7564 9408 : reduc_fn = IFN_LAST;
7565 : }
7566 : }
7567 : else
7568 : {
7569 676 : if (dump_enabled_p ())
7570 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7571 : "no reduc code for scalar code.\n");
7572 :
7573 676 : return false;
7574 : }
7575 46768 : if (reduc_fn == IFN_LAST
7576 46768 : && VECTOR_BOOLEAN_TYPE_P (vectype_out))
7577 : {
7578 171 : if (!alt_vectype_out)
7579 : {
7580 12 : if (dump_enabled_p ())
7581 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7582 : "cannot turn mask into bool data vector for "
7583 : "reduction epilogue.\n");
7584 12 : return false;
7585 : }
7586 159 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7587 : }
7588 : }
7589 384 : else if (reduction_type == COND_REDUCTION)
7590 : {
7591 384 : int scalar_precision
7592 384 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7593 384 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
7594 384 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7595 : vectype_out);
7596 :
7597 384 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7598 : OPTIMIZE_FOR_SPEED))
7599 12 : reduc_fn = IFN_REDUC_MAX;
7600 : }
7601 47140 : VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
7602 :
7603 47140 : if (reduction_type != EXTRACT_LAST_REDUCTION
7604 : && reduc_fn == IFN_LAST
7605 : && !nunits_out.is_constant ())
7606 : {
7607 : if (dump_enabled_p ())
7608 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7609 : "missing target support for reduction on"
7610 : " variable-length vectors.\n");
7611 : return false;
7612 : }
7613 :
7614 : /* For SLP reductions, see if there is a neutral value we can use. */
7615 47140 : tree neutral_op = NULL_TREE;
7616 47140 : tree initial_value = NULL_TREE;
7617 47140 : if (reduc_chain)
7618 1374 : initial_value = vect_phi_initial_value (reduc_def_phi);
7619 47140 : neutral_op = neutral_op_for_reduction (TREE_TYPE
7620 : (gimple_phi_result (reduc_def_phi)),
7621 : orig_code, initial_value);
7622 47140 : VECT_REDUC_INFO_NEUTRAL_OP (reduc_info) = neutral_op;
7623 :
7624 47140 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7625 : {
7626 : /* We can't support in-order reductions of code such as this:
7627 :
7628 : for (int i = 0; i < n1; ++i)
7629 : for (int j = 0; j < n2; ++j)
7630 : l += a[j];
7631 :
7632 : since GCC effectively transforms the loop when vectorizing:
7633 :
7634 : for (int i = 0; i < n1 / VF; ++i)
7635 : for (int j = 0; j < n2; ++j)
7636 : for (int k = 0; k < VF; ++k)
7637 : l += a[j];
7638 :
7639 : which is a reassociation of the original operation. */
7640 56 : if (dump_enabled_p ())
7641 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7642 : "in-order double reduction not supported.\n");
7643 :
7644 56 : return false;
7645 : }
7646 :
7647 47084 : if (reduction_type == FOLD_LEFT_REDUCTION
7648 3982 : && SLP_TREE_LANES (slp_node) > 1
7649 117 : && !reduc_chain)
7650 : {
7651 : /* We cannot use in-order reductions in this case because there is
7652 : an implicit reassociation of the operations involved. */
7653 55 : if (dump_enabled_p ())
7654 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7655 : "in-order unchained SLP reductions not supported.\n");
7656 55 : return false;
7657 : }
7658 :
7659 : /* For double reductions, and for SLP reductions with a neutral value,
7660 : we construct a variable-length initial vector by loading a vector
7661 : full of the neutral value and then shift-and-inserting the start
7662 : values into the low-numbered elements. This is however not needed
7663 : when neutral and initial value are equal or we can handle the
7664 : initial value via adjustment in the epilogue. */
7665 47029 : if ((double_reduc || neutral_op)
7666 : && !nunits_out.is_constant ()
7667 : && reduction_type != INTEGER_INDUC_COND_REDUCTION
7668 : && !((SLP_TREE_LANES (slp_node) == 1 || reduc_chain)
7669 : && neutral_op
7670 : && (!double_reduc
7671 : || operand_equal_p (neutral_op,
7672 : vect_phi_initial_value (reduc_def_phi))))
7673 : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7674 : vectype_out, OPTIMIZE_FOR_BOTH))
7675 : {
7676 : if (dump_enabled_p ())
7677 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7678 : "reduction on variable-length vectors requires"
7679 : " target support for a vector-shift-and-insert"
7680 : " operation.\n");
7681 : return false;
7682 : }
7683 :
7684 : /* Check extra constraints for variable-length unchained SLP reductions. */
7685 47029 : if (!reduc_chain
7686 : && !nunits_out.is_constant ())
7687 : {
7688 : /* We checked above that we could build the initial vector when
7689 : there's a neutral element value. Check here for the case in
7690 : which each SLP statement has its own initial value and in which
7691 : that value needs to be repeated for every instance of the
7692 : statement within the initial vector. */
7693 : unsigned int group_size = SLP_TREE_LANES (slp_node);
7694 : if (!neutral_op
7695 : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7696 : TREE_TYPE (vectype_out)))
7697 : {
7698 : if (dump_enabled_p ())
7699 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7700 : "unsupported form of SLP reduction for"
7701 : " variable-length vectors: cannot build"
7702 : " initial vector.\n");
7703 : return false;
7704 : }
7705 : /* The epilogue code relies on the number of elements being a multiple
7706 : of the group size. The duplicate-and-interleave approach to setting
7707 : up the initial vector does too. */
7708 : if (!multiple_p (nunits_out, group_size))
7709 : {
7710 : if (dump_enabled_p ())
7711 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7712 : "unsupported form of SLP reduction for"
7713 : " variable-length vectors: the vector size"
7714 : " is not a multiple of the number of results.\n");
7715 : return false;
7716 : }
7717 : }
7718 :
7719 47029 : if (reduction_type == COND_REDUCTION)
7720 : {
7721 384 : widest_int ni;
7722 :
7723 384 : if (! max_loop_iterations (loop, &ni))
7724 : {
7725 14 : if (dump_enabled_p ())
7726 0 : dump_printf_loc (MSG_NOTE, vect_location,
7727 : "loop count not known, cannot create cond "
7728 : "reduction.\n");
7729 14 : return false;
7730 : }
7731 : /* Convert backedges to iterations. */
7732 370 : ni += 1;
7733 :
7734 : /* The additional index will be the same type as the condition. Check
7735 : that the loop can fit into this less one (because we'll use up the
7736 : zero slot for when there are no matches). */
7737 370 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7738 370 : if (wi::geu_p (ni, wi::to_widest (max_index)))
7739 : {
7740 90 : if (dump_enabled_p ())
7741 54 : dump_printf_loc (MSG_NOTE, vect_location,
7742 : "loop size is greater than data size.\n");
7743 90 : return false;
7744 : }
7745 384 : }
7746 :
7747 : /* In case the vectorization factor (VF) is bigger than the number
7748 : of elements that we can fit in a vectype (nunits), we have to generate
7749 : more than one vector stmt - i.e - we need to "unroll" the
7750 : vector stmt by a factor VF/nunits. For more details see documentation
7751 : in vectorizable_operation. */
7752 :
7753 : /* If the reduction is used in an outer loop we need to generate
7754 : VF intermediate results, like so (e.g. for ncopies=2):
7755 : r0 = phi (init, r0)
7756 : r1 = phi (init, r1)
7757 : r0 = x0 + r0;
7758 : r1 = x1 + r1;
7759 : (i.e. we generate VF results in 2 registers).
7760 : In this case we have a separate def-use cycle for each copy, and therefore
7761 : for each copy we get the vector def for the reduction variable from the
7762 : respective phi node created for this copy.
7763 :
7764 : Otherwise (the reduction is unused in the loop nest), we can combine
7765 : together intermediate results, like so (e.g. for ncopies=2):
7766 : r = phi (init, r)
7767 : r = x0 + r;
7768 : r = x1 + r;
7769 : (i.e. we generate VF/2 results in a single register).
7770 : In this case for each copy we get the vector def for the reduction variable
7771 : from the vectorized reduction operation generated in the previous iteration.
7772 :
7773 : This only works when we see both the reduction PHI and its only consumer
7774 : in vectorizable_reduction and there are no intermediate stmts
7775 : participating. When unrolling we want each unrolled iteration to have its
7776 : own reduction accumulator since one of the main goals of unrolling a
7777 : reduction is to reduce the aggregate loop-carried latency. */
7778 46925 : if (ncopies > 1
7779 46925 : && !reduc_chain
7780 5252 : && SLP_TREE_LANES (slp_node) == 1
7781 5092 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7782 5073 : && reduc_chain_length == 1
7783 4770 : && loop_vinfo->suggested_unroll_factor == 1)
7784 46925 : single_defuse_cycle = true;
7785 :
7786 46925 : if (single_defuse_cycle && !lane_reducing)
7787 : {
7788 4203 : gcc_assert (op.code != COND_EXPR);
7789 :
7790 : /* 4. check support for the operation in the loop
7791 :
7792 : This isn't necessary for the lane reduction codes, since they
7793 : can only be produced by pattern matching, and it's up to the
7794 : pattern matcher to test for support. The main reason for
7795 : specifically skipping this step is to avoid rechecking whether
7796 : mixed-sign dot-products can be implemented using signed
7797 : dot-products. */
7798 4203 : machine_mode vec_mode = TYPE_MODE (vectype_in);
7799 4203 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
7800 : {
7801 711 : if (dump_enabled_p ())
7802 24 : dump_printf (MSG_NOTE, "op not supported by target.\n");
7803 1422 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7804 711 : || !vect_can_vectorize_without_simd_p (op.code))
7805 : single_defuse_cycle = false;
7806 : else
7807 5 : if (dump_enabled_p ())
7808 0 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7809 : }
7810 :
7811 4203 : if (vect_emulated_vector_p (vectype_in)
7812 4203 : && !vect_can_vectorize_without_simd_p (op.code))
7813 : {
7814 0 : if (dump_enabled_p ())
7815 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
7816 0 : return false;
7817 : }
7818 : }
7819 46925 : if (dump_enabled_p () && single_defuse_cycle)
7820 650 : dump_printf_loc (MSG_NOTE, vect_location,
7821 : "using single def-use cycle for reduction by reducing "
7822 : "multiple vectors to one in the loop body\n");
7823 46925 : VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7824 :
7825 : /* For lane-reducing operation, the below processing related to single
7826 : defuse-cycle will be done in its own vectorizable function. One more
7827 : thing to note is that the operation must not be involved in fold-left
7828 : reduction. */
7829 46925 : single_defuse_cycle &= !lane_reducing;
7830 :
7831 46925 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
7832 23976 : for (i = 0; i < (int) op.num_ops; i++)
7833 16612 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7834 : {
7835 0 : if (dump_enabled_p ())
7836 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7837 : "incompatible vector types for invariants\n");
7838 0 : return false;
7839 : }
7840 :
7841 46925 : vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
7842 : reduction_type, ncopies, cost_vec);
7843 : /* Cost the reduction op inside the loop if transformed via
7844 : vect_transform_reduction for non-lane-reducing operation. Otherwise
7845 : this is costed by the separate vectorizable_* routines. */
7846 46925 : if (single_defuse_cycle)
7847 3497 : record_stmt_cost (cost_vec, ncopies, vector_stmt,
7848 : slp_for_stmt_info, 0, vect_body);
7849 :
7850 46925 : if (dump_enabled_p ()
7851 46925 : && reduction_type == FOLD_LEFT_REDUCTION)
7852 219 : dump_printf_loc (MSG_NOTE, vect_location,
7853 : "using an in-order (fold-left) reduction.\n");
7854 46925 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7855 :
7856 : /* All but single defuse-cycle optimized and fold-left reductions go
7857 : through their own vectorizable_* routines. */
7858 46925 : stmt_vec_info tem
7859 46925 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (slp_node_instance));
7860 46925 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
7861 39561 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7862 : else
7863 : {
7864 7364 : STMT_VINFO_DEF_TYPE (tem) = vect_reduction_def;
7865 7364 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7866 3165 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7867 : slp_node, op.code, op.type,
7868 : vectype_in);
7869 : }
7870 : return true;
7871 : }
7872 :
7873 : /* STMT_INFO is a dot-product reduction whose multiplication operands
7874 : have different signs. Emit a sequence to emulate the operation
7875 : using a series of signed DOT_PROD_EXPRs and return the last
7876 : statement generated. VEC_DEST is the result of the vector operation
7877 : and VOP lists its inputs. Note that all statements except the
7877 : returned one are inserted at GSI; the caller is responsible for
7877 : inserting the final (returned) assignment. */
7878 :
7879 : static gassign *
7880 4 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7881 : gimple_stmt_iterator *gsi, tree vec_dest,
7882 : tree vop[3])
7883 : {
7884 4 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7885 4 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7886 4 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
7887 4 : gimple *new_stmt;
7888 :
7889 : /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
7890 4 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7891 0 : std::swap (vop[0], vop[1]);
7892 :
7893 : /* Convert all inputs to signed types, so that the whole emulation
7893 : below can use signed DOT_PROD_EXPRs only. */
7894 16 : for (int i = 0; i < 3; ++i)
7895 12 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7896 : {
7897 4 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7898 4 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7899 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7900 4 : vop[i] = tmp;
7901 : }
7902 :
7903 : /* In the comments below we assume 8-bit inputs for simplicity,
7904 : but the approach works for any full integer type. */
7905 :
7906 : /* Create a vector of -128 (the minimum value of the narrow
7906 : signed element type). */
7907 4 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7908 4 : tree min_narrow = build_vector_from_val (narrow_vectype,
7909 : min_narrow_elttype);
7910 :
7911 : /* Create a vector of 64, i.e. half the magnitude of the minimum,
7911 : obtained by a logical right shift of the minimum's bit pattern. */
7912 4 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7913 4 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7914 4 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7915 :
7916 : /* Emit: SUB_RES = VOP[0] - 128, implemented as a (wrapping)
7916 : addition of the -128 vector built above. */
7917 4 : tree sub_res = make_ssa_name (narrow_vectype);
7918 4 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7919 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7920 :
7921 : /* Emit:
7922 :
7923 : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7924 : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7925 : STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
7926 :
7927 : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7928 : Doing the two 64 * y steps first allows more time to compute x. */
7929 4 : tree stage1 = make_ssa_name (wide_vectype);
7930 4 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7931 : vop[1], half_narrow, vop[2]);
7932 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7933 :
7934 4 : tree stage2 = make_ssa_name (wide_vectype);
7935 4 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7936 : vop[1], half_narrow, stage1);
7937 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7938 :
7939 4 : tree stage3 = make_ssa_name (wide_vectype);
7940 4 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7941 : sub_res, vop[1], stage2);
7942 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7943 :
7944 : /* Convert STAGE3 to the reduction type; this final assignment is
7944 : returned without being inserted at GSI. */
7945 4 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7946 4 : }
7947 :
/* Transform the definition stmt STMT_INFO of a reduction PHI backedge
   value.  Vector statements are emitted at GSI and the resulting
   vector definitions are recorded in SLP_NODE.  */

bool
vect_transform_reduction (loop_vec_info loop_vinfo,
			  stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
			  slp_tree slp_node)
{
  tree vectype_out = SLP_TREE_VECTYPE (slp_node);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned vec_num;

  vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);

  /* For a nested cycle work relative to the inner loop; only double
     reductions are expected in that situation.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
		  == vect_double_reduction_def);
    }

  gimple_match_op op;
  if (!gimple_extract_op (stmt_info->stmt, &op))
    gcc_unreachable ();

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
  tree vectype_in = SLP_TREE_VECTYPE (slp_node);
  /* For lane-reducing ops the input vectype differs from the output
     vectype; take it from the first child.  */
  if (lane_reducing_op_p (op.code))
    vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);

  vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);

  /* If CODE is already a masked internal function use it directly,
     otherwise look up the conditional variant of CODE.  */
  code_helper code = canonicalize_code (op.code, op.type);
  internal_fn cond_fn
    = ((code.is_internal_fn ()
	&& internal_fn_mask_index ((internal_fn)code) != -1)
       ? (internal_fn)code : get_conditional_internal_fn (code, op.type));

  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);

  /* Transform.  */
  tree new_temp = NULL_TREE;
  auto_vec<tree> vec_oprnds[3];

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");

  /* A binary COND_OP reduction must have the same definition and else
     value.  */
  bool cond_fn_p = code.is_internal_fn ()
    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
  if (cond_fn_p)
    {
      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
		  || code == IFN_COND_MUL || code == IFN_COND_AND
		  || code == IFN_COND_IOR || code == IFN_COND_XOR
		  || code == IFN_COND_MIN || code == IFN_COND_MAX);
      gcc_assert (op.num_ops == 4
		  && (op.ops[reduc_index]
		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
    }

  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  /* Fold-left reductions are handled by a dedicated routine.  */
  vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
  if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
      gcc_assert (code.is_tree_code () || cond_fn_p);
      return vectorize_fold_left_reduction
	  (loop_vinfo, stmt_info, gsi, slp_node,
	   code, reduc_fn, op.num_ops, vectype_in,
	   reduc_index, masks, lens);
    }

  bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
  bool lane_reducing = lane_reducing_op_p (code);
  gcc_assert (single_defuse_cycle || lane_reducing);

  if (lane_reducing)
    {
      /* The last operand of lane-reducing op is for reduction.  */
      gcc_assert (reduc_index == (int) op.num_ops - 1);
    }

  /* Create the destination vector.  */
  tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  /* Get NCOPIES vector definitions for all operands except the reduction
     definition.  */
  if (!cond_fn_p)
    {
      gcc_assert (reduc_index >= 0 && reduc_index <= 2);
      vect_get_vec_defs (loop_vinfo, slp_node,
			 single_defuse_cycle && reduc_index == 0
			 ? NULL_TREE : op.ops[0], &vec_oprnds[0],
			 single_defuse_cycle && reduc_index == 1
			 ? NULL_TREE : op.ops[1], &vec_oprnds[1],
			 op.num_ops == 3
			 && !(single_defuse_cycle && reduc_index == 2)
			 ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
    }
  else
    {
      /* For a conditional operation pass the truth type as mask
	 vectype.  */
      gcc_assert (single_defuse_cycle
		  && (reduc_index == 1 || reduc_index == 2));
      vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
			 &vec_oprnds[0],
			 reduc_index == 1 ? NULL_TREE : op.ops[1],
			 &vec_oprnds[1],
			 reduc_index == 2 ? NULL_TREE : op.ops[2],
			 &vec_oprnds[2]);
    }

  /* For single def-use cycles get one copy of the vectorized reduction
     definition.  */
  if (single_defuse_cycle)
    {
      vect_get_vec_defs (loop_vinfo, slp_node,
			 reduc_index == 0 ? op.ops[0] : NULL_TREE,
			 &vec_oprnds[0],
			 reduc_index == 1 ? op.ops[1] : NULL_TREE,
			 &vec_oprnds[1],
			 reduc_index == 2 ? op.ops[2] : NULL_TREE,
			 &vec_oprnds[2]);
    }
  else if (lane_reducing)
    {
      /* For normal reduction, consistency between vectorized def/use is
	 naturally ensured when mapping from scalar statement.  But if lane-
	 reducing op is involved in reduction, thing would become somewhat
	 complicated in that the op's result and operand for accumulation are
	 limited to less lanes than other operands, which certainly causes
	 def/use mismatch on adjacent statements around the op if do not have
	 any kind of specific adjustment.  One approach is to refit lane-
	 reducing op in the way of introducing new trivial pass-through copies
	 to fix possible def/use gap, so as to make it behave like a normal op.
	 And vector reduction PHIs are always generated to the full extent, no
	 matter lane-reducing op exists or not.  If some copies or PHIs are
	 actually superfluous, they would be cleaned up by passes after
	 vectorization.  An example for single-lane slp, lane-reducing ops
	 with mixed input vectypes in a reduction chain, is given as below.
	 Similarly, this handling is applicable for multiple-lane slp as well.

	   int sum = 1;
	   for (i)
	     {
	       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
	       sum += w[i];               // widen-sum <vector(16) char>
	       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
	       sum += n[i];               // normal <vector(4) int>
	     }

	 The vector size is 128-bit, vectorization factor is 16.  Reduction
	 statements would be transformed as:

	   vector<4> int sum_v0 = { 0, 0, 0, 1 };
	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
	   vector<4> int sum_v3 = { 0, 0, 0, 0 };

	   for (i / 16)
	     {
	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
	       sum_v1 = sum_v1;  // copy
	       sum_v2 = sum_v2;  // copy
	       sum_v3 = sum_v3;  // copy

	       sum_v0 = sum_v0;  // copy
	       sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
	       sum_v2 = sum_v2;  // copy
	       sum_v3 = sum_v3;  // copy

	       sum_v0 = sum_v0;  // copy
	       sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
	       sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
	       sum_v3 = sum_v3;  // copy

	       sum_v0 += n_v0[i: 0  ~ 3 ];
	       sum_v1 += n_v1[i: 4  ~ 7 ];
	       sum_v2 += n_v2[i: 8  ~ 11];
	       sum_v3 += n_v3[i: 12 ~ 15];
	     }

	 Moreover, for a higher instruction parallelism in final vectorized
	 loop, it is considered to make those effective vector lane-reducing
	 ops be distributed evenly among all def-use cycles.  In the above
	 example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
	 cycles, instruction dependency among them could be eliminated.  */
      unsigned effec_ncopies = vec_oprnds[0].length ();
      unsigned total_ncopies = vec_oprnds[reduc_index].length ();

      gcc_assert (effec_ncopies <= total_ncopies);

      if (effec_ncopies < total_ncopies)
	{
	  /* Pad the non-reduction operand arrays with empty (NULL) slots
	     up to TOTAL_NCOPIES; those slots become trivial pass-through
	     copies in the generation loop below.  */
	  for (unsigned i = 0; i < op.num_ops - 1; i++)
	    {
	      gcc_assert (vec_oprnds[i].length () == effec_ncopies);
	      vec_oprnds[i].safe_grow_cleared (total_ncopies);
	    }
	}

      tree reduc_vectype_in = vectype_in;
      gcc_assert (reduc_vectype_in);

      unsigned effec_reduc_ncopies
	= vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);

      gcc_assert (effec_ncopies <= effec_reduc_ncopies);

      if (effec_ncopies < effec_reduc_ncopies)
	{
	  /* Find suitable def-use cycles to generate vectorized statements
	     into, and reorder operands based on the selection.  */
	  unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
	  unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;

	  gcc_assert (curr_pos < effec_reduc_ncopies);
	  VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;

	  if (curr_pos)
	    {
	      /* Rotate the effective operands so they land in the cycles
		 starting at CURR_POS; the vacated leading slots stay NULL
		 and produce pass-through copies.  */
	      unsigned count = effec_reduc_ncopies - effec_ncopies;
	      unsigned start = curr_pos - count;

	      if ((int) start < 0)
		{
		  count = curr_pos;
		  start = 0;
		}

	      for (unsigned i = 0; i < op.num_ops - 1; i++)
		{
		  for (unsigned j = effec_ncopies; j > start; j--)
		    {
		      unsigned k = j - 1;
		      std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
		      gcc_assert (!vec_oprnds[i][k]);
		    }
		}
	    }
	}
    }

  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
  /* NUM is the number of vector statements to generate; use a non-reduction
     operand array to size it since the reduction slot may grow below.  */
  unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
  unsigned mask_index = 0;

  for (unsigned i = 0; i < num; ++i)
    {
      gimple *new_stmt;
      tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
      if (!vop[0] || !vop[1])
	{
	  tree reduc_vop = vec_oprnds[reduc_index][i];

	  /* If could not generate an effective vector statement for current
	     portion of reduction operand, insert a trivial copy to simply
	     hand over the operand to other dependent statements.  */
	  gcc_assert (reduc_vop);

	  if (TREE_CODE (reduc_vop) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
	    new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
	  else
	    {
	      new_temp = make_ssa_name (vec_dest);
	      new_stmt = gimple_build_assign (new_temp, reduc_vop);
	      vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
					   gsi);
	    }
	}
      else if (masked_loop_p && !mask_by_cond_expr)
	{
	  /* No conditional ifns have been defined for lane-reducing op
	     yet.  */
	  gcc_assert (!lane_reducing);

	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					  vec_num, vectype_in,
					  mask_index++);
	  gcall *call;
	  if (code.is_internal_fn () && cond_fn_p)
	    {
	      /* The operation is already conditional; AND the loop mask
		 into its existing mask operand.  */
	      gcc_assert (op.num_ops >= 3
			  && internal_fn_mask_index (internal_fn (code)) == 0);
	      vop[2] = vec_oprnds[2][i];
	      mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
				       mask, vop[0], gsi);
	      call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
						 vop[2], vop[reduc_index]);
	    }
	  else
	    call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
					       vop[1], vop[reduc_index]);
	  new_temp = make_ssa_name (vec_dest, call);
	  gimple_call_set_lhs (call, new_temp);
	  gimple_call_set_nothrow (call, true);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
	  new_stmt = call;
	}
      else
	{
	  if (op.num_ops >= 3)
	    vop[2] = vec_oprnds[2][i];

	  /* Apply the loop mask via a VEC_COND_EXPR on an operand instead
	     of using a conditional internal function.  */
	  if (masked_loop_p && mask_by_cond_expr)
	    {
	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					      vec_num, vectype_in,
					      mask_index++);
	      build_vect_cond_expr (code, vop, mask, gsi);
	    }

	  if (emulated_mixed_dot_prod)
	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
						    vec_dest, vop);

	  else if (code.is_internal_fn () && !cond_fn_p)
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2]);
	  else if (code.is_internal_fn () && cond_fn_p)
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2],
						   vop[reduc_index]);
	  else
	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
					    vop[0], vop[1], vop[2]);
	  new_temp = make_ssa_name (vec_dest, new_stmt);
	  gimple_set_lhs (new_stmt, new_temp);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
	}

      /* In a single def-use cycle this statement's result is the reduction
	 operand of the next copy; otherwise record it as a vector def of
	 the SLP node.  */
      if (single_defuse_cycle && i < num - 1)
	vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
      else
	slp_node->push_vec_def (new_stmt);
    }

  return true;
}
8301 :
/* Transform phase of a cycle PHI: create the vector reduction PHIs in the
   loop header with their loop-entry arguments filled in.  The loop-latch
   arguments are filled in later, during epilogue processing.  */

bool
vect_transform_cycle_phi (loop_vec_info loop_vinfo,
			  stmt_vec_info stmt_info,
			  slp_tree slp_node, slp_instance slp_node_instance)
{
  tree vectype_out = SLP_TREE_VECTYPE (slp_node);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  bool nested_cycle = false;
  int vec_num;

  /* For a nested cycle the PHIs are created in the inner loop.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      nested_cycle = true;
    }

  vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
  if (reduc_info
      && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
	  || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
    /* Leave the scalar phi in place.  */
    return true;

  if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vectorizing a reduction chain\n");

  vec_num = vect_get_num_copies (loop_vinfo, slp_node);

  /* Check whether we should use a single PHI node and accumulate
     vectors to one before the backedge.  */
  if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
    vec_num = 1;

  /* Create the destination vector.  */
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
					       vectype_out);

  /* Get the loop-entry arguments.  */
  auto_vec<tree> vec_initial_defs;
  vec_initial_defs.reserve (vec_num);
  /* Optimize: if initial_def is for REDUC_MAX smaller than the base
     and we can't use zero for induc_val, use initial_def.  Similarly
     for REDUC_MIN and initial_def larger than the base.  */
  if (reduc_info
      && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
    {
      gcc_assert (SLP_TREE_LANES (slp_node) == 1);
      tree initial_def = vect_phi_initial_value (phi);
      VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
      tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
      if (TREE_CODE (initial_def) == INTEGER_CST
	  && !integer_zerop (induc_val)
	  && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
	       && tree_int_cst_lt (initial_def, induc_val))
	      || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
		  && tree_int_cst_lt (induc_val, initial_def))))
	{
	  induc_val = initial_def;
	  /* Communicate we used the initial_def to epilogue
	     generation.  */
	  VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
	}
      vec_initial_defs.quick_push
	(build_vector_from_val (vectype_out, induc_val));
    }
  else if (nested_cycle)
    {
      /* For nested cycles the initial values are the vectorized defs of
	 the preheader PHI argument.  */
      unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
			 &vec_initial_defs);
    }
  else
    {
      gcc_assert (slp_node == slp_node_instance->reduc_phis);
      vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
      vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);

      /* A reduction chain contributes a single initial value.  */
      unsigned int num_phis = stmts.length ();
      if (reduc_info->is_reduc_chain)
	num_phis = 1;
      initial_values.reserve (num_phis);
      for (unsigned int i = 0; i < num_phis; ++i)
	{
	  gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
	  initial_values.quick_push (vect_phi_initial_value (this_phi));
	}
      tree neutral_op = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
      if (vec_num == 1
	  && vect_find_reusable_accumulator (loop_vinfo,
					     reduc_info, vectype_out))
	;
      /* Try to simplify the vector initialization by applying an
	 adjustment after the reduction has been performed.  This
	 can also break a critical path but on the other hand
	 requires to keep the initial value live across the loop.  */
      else if (neutral_op
	       && initial_values.length () == 1
	       && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       && !operand_equal_p (neutral_op, initial_values[0]))
	{
	  VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
	    = initial_values[0];
	  initial_values[0] = neutral_op;
	}
      if (!VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
	  || loop_vinfo->main_loop_edge)
	get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
					&vec_initial_defs, vec_num,
					stmts.length (), neutral_op);
    }

  /* If we reuse an accumulator from the main loop, adapt it to the
     vector type of this (epilogue) loop.  */
  if (reduc_info)
    if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
      {
	tree def = accumulator->reduc_input;
	if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
	  {
	    unsigned int nreduc;
	    bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
					      (TREE_TYPE (def)),
					    TYPE_VECTOR_SUBPARTS (vectype_out),
					    &nreduc);
	    gcc_assert (res);
	    gimple_seq stmts = NULL;
	    /* Reduce the single vector to a smaller one.  */
	    if (nreduc != 1)
	      {
		/* Perform the reduction in the appropriate type.  */
		tree rvectype = vectype_out;
		if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
						TREE_TYPE (TREE_TYPE (def))))
		  rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
						TYPE_VECTOR_SUBPARTS
						  (vectype_out));
		def = vect_create_partial_epilog (def, rvectype,
						  VECT_REDUC_INFO_CODE
						    (reduc_info),
						  &stmts);
	      }
	    /* The epilogue loop might use a different vector mode, like
	       VNx2DI vs. V2DI.  */
	    if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
	      {
		tree reduc_type = build_vector_type_for_mode
		    (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
		def = gimple_convert (&stmts, reduc_type, def);
	      }
	    /* Adjust the input so we pick up the partially reduced value
	       for the skip edge in vect_create_epilog_for_reduction.  */
	    accumulator->reduc_input = def;
	    /* And the reduction could be carried out using a different
	       sign.  */
	    if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
	      def = gimple_convert (&stmts, vectype_out, def);
	    edge e;
	    if ((e = loop_vinfo->main_loop_edge)
		|| (e = loop_vinfo->skip_this_loop_edge))
	      {
		/* While we'd like to insert on the edge this will split
		   blocks and disturb bookkeeping, we also will eventually
		   need this on the skip edge.  Rely on sinking to
		   fixup optimal placement and insert in the pred.  */
		gimple_stmt_iterator gsi = gsi_last_bb (e->src);
		/* Insert before a cond that eventually skips the
		   epilogue.  */
		if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
		  gsi_prev (&gsi);
		gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
	      }
	    else
	      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
						stmts);
	  }
	if (loop_vinfo->main_loop_edge)
	  vec_initial_defs[0]
	    = vect_get_main_loop_result (loop_vinfo, def,
					 vec_initial_defs[0]);
	else
	  vec_initial_defs.safe_push (def);
      }

  /* Generate the reduction PHIs upfront.  */
  for (i = 0; i < vec_num; i++)
    {
      tree vec_init_def = vec_initial_defs[i];
      /* Create the reduction-phi that defines the reduction
	 operand.  */
      gphi *new_phi = create_phi_node (vec_dest, loop->header);
      add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
		   UNKNOWN_LOCATION);

      /* The loop-latch arg is set in epilogue processing.  */

      slp_node->push_vec_def (new_phi);
    }

  return true;
}
8504 :
8505 : /* Vectorizes LC PHIs. */
8506 :
8507 : bool
8508 170844 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8509 : stmt_vec_info stmt_info,
8510 : slp_tree slp_node)
8511 : {
8512 170844 : if (!loop_vinfo
8513 170844 : || !is_a <gphi *> (stmt_info->stmt)
8514 202881 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8515 : return false;
8516 :
8517 761 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8518 0 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8519 : return false;
8520 :
8521 : /* Deal with copies from externs or constants that disguise as
8522 : loop-closed PHI nodes (PR97886). */
8523 761 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8524 : SLP_TREE_VECTYPE (slp_node)))
8525 : {
8526 0 : if (dump_enabled_p ())
8527 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8528 : "incompatible vector types for invariants\n");
8529 0 : return false;
8530 : }
8531 :
8532 : /* ??? This can happen with data vs. mask uses of boolean. */
8533 761 : if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
8534 761 : SLP_TREE_VECTYPE
8535 : (SLP_TREE_CHILDREN (slp_node)[0])))
8536 : {
8537 0 : if (dump_enabled_p ())
8538 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8539 : "missed mask promotion\n");
8540 0 : return false;
8541 : }
8542 :
8543 761 : SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
8544 761 : return true;
8545 : }
8546 :
8547 : bool
8548 504 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
8549 : stmt_vec_info stmt_info,
8550 : slp_tree slp_node)
8551 : {
8552 :
8553 504 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8554 504 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8555 504 : basic_block bb = gimple_bb (stmt_info->stmt);
8556 504 : edge e = single_pred_edge (bb);
8557 504 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8558 504 : auto_vec<tree> vec_oprnds;
8559 1008 : vect_get_vec_defs (loop_vinfo, slp_node,
8560 504 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8561 1118 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8562 : {
8563 : /* Create the vectorized LC PHI node. */
8564 614 : gphi *new_phi = create_phi_node (vec_dest, bb);
8565 614 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8566 614 : slp_node->push_vec_def (new_phi);
8567 : }
8568 :
8569 504 : return true;
8570 504 : }
8571 :
/* Vectorizes PHIs.  With a non-NULL COST_VEC this performs the analysis
   and costing only; with a NULL COST_VEC it emits the vector PHI nodes.  */

bool
vectorizable_phi (bb_vec_info vinfo,
		  stmt_vec_info stmt_info,
		  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
    return false;

  tree vectype = SLP_TREE_VECTYPE (slp_node);

  if (cost_vec) /* transformation not required.  */
    {
      slp_tree child;
      unsigned i;
      /* Verify every PHI argument has a vectorizable, type-compatible
	 definition.  */
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
	if (!child)
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "PHI node with unvectorized backedge def\n");
	    return false;
	  }
	else if (!vect_maybe_update_slp_op_vectype (child, vectype))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector types for invariants\n");
	    return false;
	  }
	else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
		 && !useless_type_conversion_p (vectype,
						SLP_TREE_VECTYPE (child)))
	  {
	    /* With bools we can have mask and non-mask precision vectors
	       or different non-mask precisions.  While pattern recog is
	       supposed to guarantee consistency here bugs in it can cause
	       mismatches (PR103489 and PR103800 for example).
	       Deal with them here instead of ICEing later.  */
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector type setup from "
			       "bool pattern detection\n");
	    return false;
	  }

      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
	record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
			  vector_stmt, slp_node, vectype, 0, vect_body);
      SLP_TREE_TYPE (slp_node) = phi_info_type;
      return true;
    }

  /* Transform: create the vector PHIs lazily on the first argument with
     vectorized defs, then fill in one PHI argument per edge.  */
  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
  basic_block bb = gimple_bb (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
  auto_vec<gphi *> new_phis;
  for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];

      /* Skip not yet vectorized defs.  */
      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
	  && SLP_TREE_VEC_DEFS (child).is_empty ())
	continue;

      auto_vec<tree> vec_oprnds;
      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
      if (!new_phis.exists ())
	{
	  new_phis.create (vec_oprnds.length ());
	  for (unsigned j = 0; j < vec_oprnds.length (); j++)
	    {
	      /* Create the vectorized LC PHI node.  */
	      new_phis.quick_push (create_phi_node (vec_dest, bb));
	      slp_node->push_vec_def (new_phis[j]);
	    }
	}
      edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
      for (unsigned j = 0; j < vec_oprnds.length (); j++)
	add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
    }
  /* We should have at least one already vectorized child.  */
  gcc_assert (new_phis.exists ());

  return true;
}
8666 :
8667 : /* Vectorizes first order recurrences. An overview of the transformation
8668 : is described below. Suppose we have the following loop.
8669 :
8670 : int t = 0;
8671 : for (int i = 0; i < n; ++i)
8672 : {
8673 : b[i] = a[i] - t;
8674 : t = a[i];
8675 : }
8676 :
8677 : There is a first-order recurrence on 'a'. For this loop, the scalar IR
8678 : looks (simplified) like:
8679 :
8680 : scalar.preheader:
8681 : init = 0;
8682 :
8683 : scalar.body:
8684 : i = PHI <0(scalar.preheader), i+1(scalar.body)>
8685             :        _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8686 : _1 = a[i]
8687 : b[i] = _1 - _2
8688 : if (i < n) goto scalar.body
8689 :
8690 : In this example, _2 is a recurrence because it's value depends on the
8691 : previous iteration. We vectorize this as (VF = 4)
8692 :
8693 : vector.preheader:
8694 : vect_init = vect_cst(..., ..., ..., 0)
8695 :
8696 : vector.body
8697 : i = PHI <0(vector.preheader), i+4(vector.body)>
8698 : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8699 : vect_2 = a[i, i+1, i+2, i+3];
8700 : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8701 : b[i, i+1, i+2, i+3] = vect_2 - vect_3
8702 : if (..) goto vector.body
8703 :
8704 : In this function, vectorizable_recurr, we code generate both the
8705 : vector PHI node and the permute since those together compute the
8706 : vectorized value of the scalar PHI. We do not yet have the
8707 : backedge value to fill in there nor into the vec_perm. Those
8708 : are filled in vect_schedule_scc.
8709 :
8710 : TODO: Since the scalar loop does not have a use of the recurrence
8711 : outside of the loop the natural way to implement peeling via
8712 : vectorizing the live value doesn't work. For now peeling of loops
8713 : with a recurrence is not implemented. For SLP the supported cases
8714 : are restricted to those requiring a single vector recurrence PHI. */
8715 :
8716 : bool
8717 170125 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8718 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8719 : {
8720 170125 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8721 : return false;
8722 :
8723 31318 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
8724 :
8725 : /* So far we only support first-order recurrence auto-vectorization. */
8726 31318 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8727 : return false;
8728 :
8729 404 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8730 404 : unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
8731 404 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
 : /* DIST is the number of recurrence PHIs (SLP lanes) grouped in this
 : node; the permute below reads values DIST lanes back. */
8732 404 : unsigned dist = SLP_TREE_LANES (slp_node);
8733 : /* We need to be able to make progress with a single vector. */
8734 404 : if (maybe_gt (dist * 2, nunits))
8735 : {
8736 0 : if (dump_enabled_p ())
8737 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8738 : "first order recurrence exceeds half of "
8739 : "a vector\n");
8740 0 : return false;
8741 : }
8742 :
8743 : /* We need to be able to build a { ..., a, b } init vector with
8744 : dist number of distinct trailing values. Always possible
8745 : when dist == 1 or when nunits is constant or when the initializations
8746 : are uniform. */
8747 404 : tree uniform_initval = NULL_TREE;
8748 404 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8749 1640 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8750 : {
8751 440 : gphi *phi = as_a <gphi *> (s->stmt);
8752 440 : if (! uniform_initval)
8753 404 : uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8754 36 : else if (! operand_equal_p (uniform_initval,
8755 36 : PHI_ARG_DEF_FROM_EDGE (phi, pe)))
8756 : {
8757 : uniform_initval = NULL_TREE;
8758 : break;
8759 : }
8760 : }
8761 404 : if (!uniform_initval && !nunits.is_constant ())
8762 : {
8763 : if (dump_enabled_p ())
8764 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8765 : "cannot build initialization vector for "
8766 : "first order recurrence\n");
8767 : return false;
8768 : }
8769 :
8770 : /* First-order recurrence autovectorization needs to handle permutation
8771 : with indices = [nunits-1, nunits, nunits+1, ...]. */
8772 404 : vec_perm_builder sel (nunits, 1, 3);
8773 1616 : for (int i = 0; i < 3; ++i)
8774 1212 : sel.quick_push (nunits - dist + i);
8775 404 : vec_perm_indices indices (sel, 2, nunits);
8776 :
8777 404 : if (cost_vec) /* transformation not required. */
8778 : {
8779 362 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8780 : indices))
8781 : return false;
8782 :
8783 : /* We eventually need to set a vector type on invariant
8784 : arguments. */
8785 : unsigned j;
8786 : slp_tree child;
8787 750 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8788 500 : if (!vect_maybe_update_slp_op_vectype (child, vectype))
8789 : {
8790 0 : if (dump_enabled_p ())
8791 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8792 : "incompatible vector types for "
8793 : "invariants\n");
8794 0 : return false;
8795 : }
8796 :
8797 : /* Verify we have set up compatible types. */
8798 250 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8799 250 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
8800 250 : tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
8801 250 : if (!types_compatible_p (latch_vectype, vectype))
8802 : return false;
8803 :
8804 : /* The recurrence costs the initialization vector and one permute
8805 : for each copy. With SLP the prologue value is explicitly
8806 : represented and costed separately. */
8807 250 : unsigned prologue_cost = 0;
8808 250 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8809 : slp_node, 0, vect_body);
8810 250 : if (dump_enabled_p ())
8811 50 : dump_printf_loc (MSG_NOTE, vect_location,
8812 : "vectorizable_recurr: inside_cost = %d, "
8813 : "prologue_cost = %d .\n", inside_cost,
8814 : prologue_cost);
8815 :
8816 250 : SLP_TREE_TYPE (slp_node) = recurr_info_type;
8817 250 : return true;
8818 : }
8819 :
8820 42 : tree vec_init;
8821 42 : if (! uniform_initval)
8822 : {
 : /* Build { 0, ..., 0, init_0, ..., init_dist-1 }: zeros in the
 : leading lanes, the scalar PHI preheader values trailing. */
8823 6 : vec<constructor_elt, va_gc> *v = NULL;
8824 6 : vec_alloc (v, nunits.to_constant ());
8825 33 : for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
8826 27 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8827 : build_zero_cst (TREE_TYPE (vectype)));
8828 39 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8829 : {
8830 21 : gphi *phi = as_a <gphi *> (s->stmt);
8831 21 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8832 21 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
8833 21 : TREE_TYPE (preheader)))
8834 : {
8835 0 : gimple_seq stmts = NULL;
8836 0 : preheader = gimple_convert (&stmts,
8837 0 : TREE_TYPE (vectype), preheader);
8838 0 : gsi_insert_seq_on_edge_immediate (pe, stmts);
8839 : }
8840 21 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
8841 : }
8842 6 : vec_init = build_constructor (vectype, v);
8843 : }
8844 : else
8845 : vec_init = uniform_initval;
8846 42 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8847 :
8848 : /* Create the vectorized first-order PHI node. */
8849 42 : tree vec_dest = vect_get_new_vect_var (vectype,
8850 : vect_simple_var, "vec_recur_");
8851 42 : basic_block bb = gimple_bb (phi);
8852 42 : gphi *new_phi = create_phi_node (vec_dest, bb);
8853 42 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8854 :
8855 : /* Insert shuffles for the first-order recurrence autovectorization.
8856 : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8857 42 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
8858 :
8859 : /* Insert the required permute after the latch definition. The
8860 : second and later operands are tentative and will be updated when we have
8861 : vectorized the latch definition. */
8862 42 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8863 42 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8864 42 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8865 50 : do
8866 : {
8867 50 : gsi_next (&gsi2);
8868 : }
8869 : /* Skip inserted vectorized stmts for the latch definition. We have to
8870 : insert after those. */
8871 50 : while (gimple_uid (gsi_stmt (gsi2)) == 0);
8872 :
8873 121 : for (unsigned i = 0; i < ncopies; ++i)
8874 : {
 : /* Operand 2 of the permute (the vectorized latch value), and
 : operand 1 for copies > 0, are placeholders filled in later by
 : vect_schedule_scc (see the function comment above). */
8875 79 : vec_dest = make_ssa_name (vectype);
8876 79 : gassign *vperm
8877 121 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8878 42 : i == 0 ? gimple_phi_result (new_phi) : NULL,
8879 : NULL, perm);
8880 79 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8881 :
8882 79 : slp_node->push_vec_def (vperm);
8883 : }
8884 :
8885 : return true;
8886 404 : }
8887 :
8888 : /* Return true if VECTYPE represents a vector that requires lowering
8889 : by the vector lowering pass. */
8890 :
8891 : bool
8892 646535 : vect_emulated_vector_p (tree vectype)
8893 : {
8894 1293070 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8895 649242 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8896 2689 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8897 : }
8898 :
8899 : /* Return true if we can emulate CODE on an integer mode representation
8900 : of a vector. */
8901 :
8902 : bool
8903 10524 : vect_can_vectorize_without_simd_p (tree_code code)
8904 : {
8905 10524 : switch (code)
8906 : {
8907 : case PLUS_EXPR:
8908 : case MINUS_EXPR:
8909 : case NEGATE_EXPR:
8910 : case BIT_AND_EXPR:
8911 : case BIT_IOR_EXPR:
8912 : case BIT_XOR_EXPR:
8913 : case BIT_NOT_EXPR:
8914 : return true;
8915 :
8916 9973 : default:
8917 9973 : return false;
8918 : }
8919 : }
8920 :
8921 : /* Likewise, but taking a code_helper. */
8922 :
8923 : bool
8924 154 : vect_can_vectorize_without_simd_p (code_helper code)
8925 : {
8926 154 : return (code.is_tree_code ()
8927 154 : && vect_can_vectorize_without_simd_p (tree_code (code)));
8928 : }
8929 :
8930 : /* Create the vector initial value for a vectorized nonlinear iv,
 : appending any statements needed to STMTS.  INIT_EXPR and STEP_EXPR
 : are the scalar initial value and step, NUNITS the number of vector
 : lanes, VECTYPE the vector type and INDUCTION_TYPE the iv's update
 : operation.  Lane L of the result holds the iv value after L scalar
 : iterations.  */
8931 : static tree
8932 916 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8933 : tree step_expr, poly_uint64 nunits,
8934 : tree vectype,
8935 : enum vect_induction_op_type induction_type)
8936 : {
8937 916 : unsigned HOST_WIDE_INT const_nunits;
8938 916 : tree vec_shift, vec_init, new_name;
8939 916 : unsigned i;
8940 916 : tree itype = TREE_TYPE (vectype);
8941 :
8942 : /* iv_loop is the loop to be vectorized. Create:
8943 : vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
8944 916 : new_name = gimple_convert (stmts, itype, init_expr);
8945 916 : switch (induction_type)
8946 : {
8947 18 : case vect_step_op_shr:
8948 18 : case vect_step_op_shl:
8949 : /* Build the initial value from shift_expr: splat INIT and shift
 : lane I by I * STEP via a VEC_SERIES shift-amount vector. */
8950 18 : vec_init = gimple_build_vector_from_val (stmts,
8951 : vectype,
8952 : new_name);
8953 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8954 : build_zero_cst (itype), step_expr);
8955 18 : vec_init = gimple_build (stmts,
8956 : (induction_type == vect_step_op_shr
8957 : ? RSHIFT_EXPR : LSHIFT_EXPR),
8958 : vectype, vec_init, vec_shift);
8959 18 : break;
8960 :
8961 822 : case vect_step_op_neg:
8962 822 : {
 : /* Lanes alternate between INIT and -INIT: interleave the splat
 : with its negation. */
8963 822 : vec_init = gimple_build_vector_from_val (stmts,
8964 : vectype,
8965 : new_name);
8966 822 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8967 : vectype, vec_init);
8968 : /* The encoding has 2 interleaved stepped patterns. */
8969 822 : vec_perm_builder sel (nunits, 2, 3);
8970 822 : sel.quick_grow (6);
8971 4110 : for (i = 0; i < 3; i++)
8972 : {
8973 2466 : sel[2 * i] = i;
8974 2466 : sel[2 * i + 1] = i + nunits;
8975 : }
8976 822 : vec_perm_indices indices (sel, 2, nunits);
8977 : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8978 : fail when vec_init is const vector. In that situation vec_perm is not
8979 : really needed. */
8980 822 : tree perm_mask_even
8981 822 : = vect_gen_perm_mask_any (vectype, indices);
8982 822 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8983 : vectype,
8984 : vec_init, vec_neg,
8985 : perm_mask_even);
8986 822 : }
8987 822 : break;
8988 :
8989 76 : case vect_step_op_mul:
8990 76 : {
8991 : /* Use unsigned mult to avoid UD integer overflow. */
8992 76 : gcc_assert (nunits.is_constant (&const_nunits));
8993 76 : tree utype = unsigned_type_for (itype);
8994 76 : tree uvectype = build_vector_type (utype,
8995 76 : TYPE_VECTOR_SUBPARTS (vectype));
8996 76 : new_name = gimple_convert (stmts, utype, new_name);
8997 76 : vec_init = gimple_build_vector_from_val (stmts,
8998 : uvectype,
8999 : new_name);
9000 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
9001 76 : tree elt_step = build_one_cst (utype);
9002 :
9003 76 : elts.quick_push (elt_step);
9004 660 : for (i = 1; i < const_nunits; i++)
9005 : {
9006 : /* Create: elt_step_i = pow (step_expr, i). */
9007 508 : elt_step = gimple_build (stmts, MULT_EXPR,
9008 : utype, elt_step, step_expr);
9009 508 : elts.quick_push (elt_step);
9010 : }
9011 : /* Create a vector from [new_name_0, new_name_1, ...,
9012 : new_name_nunits-1]. */
9013 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
9014 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9015 : vec_init, vec_mul);
9016 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
9017 76 : }
9018 76 : break;
9019 :
9020 0 : default:
9021 0 : gcc_unreachable ();
9022 : }
9023 :
9024 916 : return vec_init;
9025 : }
9026 :
9027 : /* Peel init_expr by skip_niters for induction_type.  Return the value
 : the nonlinear iv with initial value INIT_EXPR and step STEP_EXPR has
 : after SKIP_NITERS scalar iterations, appending any statements needed
 : to STMTS.  SKIP_NITERS must be an INTEGER_CST unless EARLY_EXIT_P.  */
9028 : tree
9029 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9030 : tree skip_niters, tree step_expr,
9031 : enum vect_induction_op_type induction_type,
9032 : bool early_exit_p)
9033 : {
9034 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST || early_exit_p);
9035 84 : tree type = TREE_TYPE (init_expr);
9036 84 : unsigned prec = TYPE_PRECISION (type);
9037 84 : switch (induction_type)
9038 : {
9039 : /* neg inductions are typically not used for loop termination conditions but
9040 : are typically implemented as b = -b. That is every scalar iteration b is
9041 : negated. That means that for the initial value of b we will have to
9042 : determine whether the number of skipped iteration is a multiple of 2
9043 : because every 2 scalar iterations we are back at "b". */
9044 0 : case vect_step_op_neg:
9045 : /* For early exits the neg induction will always be the same value at the
9046 : start of the iteration. */
9047 0 : if (early_exit_p)
9048 : break;
9049 :
9050 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
9051 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9052 : /* else no change. */
9053 : break;
9054 :
9055 12 : case vect_step_op_shr:
9056 12 : case vect_step_op_shl:
9057 12 : skip_niters = fold_build1 (NOP_EXPR, type, skip_niters);
9058 12 : step_expr = fold_build1 (NOP_EXPR, type, step_expr);
9059 12 : step_expr = fold_build2 (MULT_EXPR, type, step_expr, skip_niters);
9060 : /* When shift amount >= precision, need to avoid UD.
9061 : In the original loop, there's no UD, and according to semantic,
9062 : init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
9063 12 : if ((!tree_fits_uhwi_p (step_expr)
9064 12 : || tree_to_uhwi (step_expr) >= prec)
9065 6 : && !early_exit_p)
9066 : {
9067 6 : if (induction_type == vect_step_op_shl
9068 6 : || TYPE_UNSIGNED (type))
9069 4 : init_expr = build_zero_cst (type);
9070 : else
9071 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9072 : init_expr,
9073 4 : wide_int_to_tree (type, prec - 1));
9074 : }
9075 : else
9076 : {
9077 8 : init_expr = fold_build2 ((induction_type == vect_step_op_shr
9078 : ? RSHIFT_EXPR : LSHIFT_EXPR),
9079 : type, init_expr, step_expr);
9080 6 : init_expr = force_gimple_operand (init_expr, stmts, false, NULL);
9081 : }
9082 : break;
9083 :
9084 72 : case vect_step_op_mul:
9085 72 : {
9086 : /* Due to UB we can't support vect_step_op_mul with early break for now.
9087 : so assert and block. */
9088 72 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9089 72 : tree utype = unsigned_type_for (type);
9090 72 : init_expr = gimple_convert (stmts, utype, init_expr);
 : /* Compute step ** skip_niters modulo 2 ** prec with GMP and fold
 : it into a single multiplier. */
9091 72 : wide_int skipn = wi::to_wide (skip_niters);
9092 72 : wide_int begin = wi::to_wide (step_expr);
9093 72 : auto_mpz base, exp, mod, res;
9094 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
9095 72 : wi::to_mpz (skipn, exp, UNSIGNED);
9096 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9097 72 : mpz_powm (res, base, exp, mod);
9098 72 : begin = wi::from_mpz (utype, res, true);
9099 72 : tree mult_expr = wide_int_to_tree (utype, begin);
9100 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
9101 : init_expr, mult_expr);
9102 72 : init_expr = gimple_convert (stmts, type, init_expr);
9103 72 : }
9104 72 : break;
9105 :
9106 0 : default:
9107 0 : gcc_unreachable ();
9108 : }
9109 :
9110 84 : return init_expr;
9111 : }
9112 :
9113 : /* Create the scalar step covering VF scalar iterations of a
 : vectorized nonlinear iv with per-iteration step STEP_EXPR,
 : appending any statements needed to STMTS.  Returns NULL for neg
 : induction, which needs no explicit step.  */
9114 : static tree
9115 1202 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9116 : poly_uint64 vf,
9117 : enum vect_induction_op_type induction_type)
9118 : {
9119 1202 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9120 1202 : tree new_name = NULL;
9121 : /* Step should be pow (step, vf) for mult induction. */
9122 1202 : if (induction_type == vect_step_op_mul)
9123 : {
9124 76 : gcc_assert (vf.is_constant ());
9125 76 : wide_int begin = wi::to_wide (step_expr);
9126 :
9127 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9128 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9129 :
9130 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9131 76 : }
9132 1126 : else if (induction_type == vect_step_op_neg)
9133 : /* Do nothing. */
9134 : ;
9135 : else
 : /* shr/shl: the shift amount over VF iterations is VF * STEP. */
9136 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9137 : expr, step_expr);
9138 1202 : return new_name;
9139 : }
9140 :
 : /* Broadcast the scalar step NEW_NAME (a constant or SSA name) into an
 : invariant vector of type VECTYPE via vect_init_vector.  Returns NULL
 : for neg induction, which needs no step vector.  */
9141 : static tree
9142 1202 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9143 : stmt_vec_info stmt_info,
9144 : tree new_name, tree vectype,
9145 : enum vect_induction_op_type induction_type)
9146 : {
9147 : /* No step is needed for neg induction. */
9148 1202 : if (induction_type == vect_step_op_neg)
9149 : return NULL;
9150 :
9151 94 : tree t = unshare_expr (new_name);
9152 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9153 : || TREE_CODE (new_name) == SSA_NAME)
9154 94 : tree new_vec = build_vector_from_val (vectype, t);
9155 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9156 : new_vec, vectype, NULL);
9157 94 : return vec_step;
9158 : }
9159 :
9160 : /* Update vectorized iv with vect_step, induc_def is init.  Emit (into
 : STMTS) one update of a vectorized nonlinear iv: apply the operation
 : selected by INDUCTION_TYPE with VEC_STEP to INDUC_DEF and return the
 : new vector def.  For neg induction INDUC_DEF is returned unchanged.  */
9161 : static tree
9162 1390 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9163 : tree induc_def, tree vec_step,
9164 : enum vect_induction_op_type induction_type)
9165 : {
9166 1390 : tree vec_def = induc_def;
9167 1390 : switch (induction_type)
9168 : {
9169 76 : case vect_step_op_mul:
9170 76 : {
9171 : /* Use unsigned mult to avoid UD integer overflow. */
9172 76 : tree uvectype = unsigned_type_for (vectype);
9173 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9174 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9175 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9176 : vec_def, vec_step);
9177 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9178 : }
9179 76 : break;
9180 :
9181 12 : case vect_step_op_shr:
9182 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9183 : vec_def, vec_step);
9184 12 : break;
9185 :
9186 6 : case vect_step_op_shl:
9187 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9188 : vec_def, vec_step);
9189 6 : break;
9190 : case vect_step_op_neg:
9191 : vec_def = induc_def;
9192 : /* Do nothing. */
9193 : break;
9194 0 : default:
9195 0 : gcc_unreachable ();
9196 : }
9197 :
9198 1390 : return vec_def;
9199 :
9200 : }
9201 :
9202 : /* Function vectorizable_nonlinear_induction
9203 :
9204 : Check if STMT_INFO performs a nonlinear induction computation that can be
9205 : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9206 : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9207 : basic block.
9208 : Return true if STMT_INFO is vectorizable in this way. */
9209 :
9210 : static bool
9211 8010 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9212 : stmt_vec_info stmt_info,
9213 : slp_tree slp_node,
9214 : stmt_vector_for_cost *cost_vec)
9215 : {
9216 8010 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9217 8010 : unsigned ncopies;
9218 8010 : bool nested_in_vect_loop = false;
9219 8010 : class loop *iv_loop;
9220 8010 : tree vec_def;
9221 8010 : edge pe = loop_preheader_edge (loop);
9222 8010 : basic_block new_bb;
9223 8010 : tree vec_init, vec_step;
9224 8010 : tree new_name;
9225 8010 : gimple *new_stmt;
9226 8010 : gphi *induction_phi;
9227 8010 : tree induc_def, vec_dest;
9228 8010 : tree init_expr, step_expr;
9229 8010 : tree niters_skip;
9230 8010 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9231 8010 : unsigned i;
9232 8010 : gimple_stmt_iterator si;
9233 :
9234 8010 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9235 :
9236 8010 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9237 8010 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9238 8010 : enum vect_induction_op_type induction_type
9239 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9240 :
9241 8010 : gcc_assert (induction_type > vect_step_op_add);
9242 :
9243 8010 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
9244 8010 : gcc_assert (ncopies >= 1);
9245 :
9246 : /* FORNOW. Only handle nonlinear induction in the same loop. */
9247 8010 : if (nested_in_vect_loop_p (loop, stmt_info))
9248 : {
9249 0 : if (dump_enabled_p ())
9250 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9251 : "nonlinear induction in nested loop.\n");
9252 0 : return false;
9253 : }
9254 :
9255 8010 : iv_loop = loop;
9256 8010 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9257 :
9258 : /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
9259 : vector iv update for each iv and a permutation to generate wanted
9260 : vector iv. */
9261 8010 : if (SLP_TREE_LANES (slp_node) > 1)
9262 : {
9263 0 : if (dump_enabled_p ())
9264 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9265 : "SLP induction not supported for nonlinear"
9266 : " induction.\n");
9267 0 : return false;
9268 : }
9269 :
9270 8010 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9271 : {
9272 0 : if (dump_enabled_p ())
9273 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9274 : "floating point nonlinear induction vectorization"
9275 : " not supported.\n");
9276 0 : return false;
9277 : }
9278 :
9279 8010 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9280 8010 : init_expr = vect_phi_initial_value (phi);
9281 8010 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9282 : && TREE_CODE (step_expr) == INTEGER_CST);
9283 : /* step_expr should be aligned with init_expr,
9284 : i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9285 8010 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9286 :
9287 8010 : if (TREE_CODE (init_expr) == INTEGER_CST)
9288 3009 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9289 5001 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9290 : {
9291 : /* INIT_EXPR could be a bit_field, bail out for such case. */
9292 4 : if (dump_enabled_p ())
9293 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9294 : "nonlinear induction vectorization failed:"
9295 : " component type of vectype is not a nop conversion"
9296 : " from type of init_expr.\n");
9297 4 : return false;
9298 : }
9299 :
 : /* Per-kind feasibility checks: verify target support for the needed
 : vector operations and reject cases that would invoke UD. */
9300 8006 : switch (induction_type)
9301 : {
9302 2538 : case vect_step_op_neg:
9303 2538 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9304 : return false;
9305 2534 : if (TREE_CODE (init_expr) != INTEGER_CST
9306 190 : && TREE_CODE (init_expr) != REAL_CST)
9307 : {
9308 : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9309 190 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9310 0 : return false;
9311 :
9312 : /* The encoding has 2 interleaved stepped patterns. */
9313 190 : vec_perm_builder sel (nunits, 2, 3);
9314 190 : machine_mode mode = TYPE_MODE (vectype);
9315 190 : sel.quick_grow (6);
9316 950 : for (i = 0; i < 3; i++)
9317 : {
9318 570 : sel[i * 2] = i;
9319 570 : sel[i * 2 + 1] = i + nunits;
9320 : }
9321 190 : vec_perm_indices indices (sel, 2, nunits);
9322 190 : if (!can_vec_perm_const_p (mode, mode, indices))
9323 0 : return false;
9324 190 : }
9325 : break;
9326 :
9327 1058 : case vect_step_op_mul:
9328 1058 : {
9329 : /* Check for backend support of MULT_EXPR. */
9330 1058 : if (!directly_supported_p (MULT_EXPR, vectype))
9331 : return false;
9332 :
9333 : /* ?? How to construct vector step for variable number vector.
9334 : [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9335 : if (!vf.is_constant ())
9336 : return false;
9337 : }
9338 : break;
9339 :
9340 4092 : case vect_step_op_shr:
9341 : /* Check for backend support of RSHIFT_EXPR. */
9342 4092 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9343 : return false;
9344 :
9345 : /* Don't shift more than type precision to avoid UD. */
9346 26 : if (!tree_fits_uhwi_p (step_expr)
9347 26 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9348 : TYPE_PRECISION (TREE_TYPE (init_expr))))
9349 : return false;
9350 : break;
9351 :
9352 318 : case vect_step_op_shl:
9353 : /* Check for backend support of LSHIFT_EXPR. */
9354 318 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9355 : return false;
9356 :
9357 : /* Don't shift more than type precision to avoid UD. */
9358 12 : if (!tree_fits_uhwi_p (step_expr)
9359 12 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9360 : TYPE_PRECISION (TREE_TYPE (init_expr))))
9361 : return false;
9362 :
9363 : break;
9364 :
9365 0 : default:
9366 0 : gcc_unreachable ();
9367 : }
9368 :
9369 3390 : if (cost_vec) /* transformation not required. */
9370 : {
9371 2474 : unsigned inside_cost = 0, prologue_cost = 0;
9372 : /* loop cost for vec_loop: one vector stmt per copy for the iv
9373 : update. */
9374 2474 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9375 : slp_node, 0, vect_body);
9376 :
9377 : /* Neg induction emits no stmts inside the loop body (see
9378 : vect_update_nonlinear_iv), so it has no inside cost. */
9379 2474 : if (induction_type == vect_step_op_neg)
9380 1712 : inside_cost = 0;
9381 :
9382 : /* prologue cost for vec_init and vec_step. */
9383 2474 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9384 : slp_node, 0, vect_prologue);
9385 :
9386 2474 : if (dump_enabled_p ())
9387 60 : dump_printf_loc (MSG_NOTE, vect_location,
9388 : "vect_model_induction_cost: inside_cost = %d, "
9389 : "prologue_cost = %d. \n", inside_cost,
9390 : prologue_cost);
9391 :
9392 2474 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9393 2474 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9394 2474 : return true;
9395 : }
9396 :
9397 : /* Transform. */
9398 :
9399 : /* Compute a vector variable, initialized with the first VF values of
9400 : the induction variable. E.g., for an iv with IV_PHI='X' and
9401 : evolution S, for a vector of 4 units, we want to compute:
9402 : [X, X + S, X + 2*S, X + 3*S]. */
9403 :
9404 916 : if (dump_enabled_p ())
9405 32 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9406 :
9407 916 : pe = loop_preheader_edge (iv_loop);
9408 : /* Find the first insertion point in the BB. */
9409 916 : basic_block bb = gimple_bb (phi);
9410 916 : si = gsi_after_labels (bb);
9411 :
9412 916 : gimple_seq stmts = NULL;
9413 :
9414 916 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9415 : /* If we are using the loop mask to "peel" for alignment then we need
9416 : to adjust the start value here. */
9417 916 : if (niters_skip != NULL_TREE)
9418 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9419 : step_expr, induction_type, false);
9420 :
9421 916 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9422 : step_expr, nunits, vectype,
9423 : induction_type);
9424 916 : if (stmts)
9425 : {
9426 162 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9427 162 : gcc_assert (!new_bb);
9428 : }
9429 :
9430 916 : stmts = NULL;
9431 916 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9432 : vf, induction_type);
9433 916 : if (stmts)
9434 : {
9435 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9436 0 : gcc_assert (!new_bb);
9437 : }
9438 :
9439 916 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9440 : new_name, vectype,
9441 : induction_type);
9442 : /* Create the following def-use cycle:
9443 : loop prolog:
9444 : vec_init = ...
9445 : vec_step = ...
9446 : loop:
9447 : vec_iv = PHI <vec_init, vec_loop>
9448 : ...
9449 : STMT
9450 : ...
9451 : vec_loop = vec_iv + vec_step; */
9452 :
9453 : /* Create the induction-phi that defines the induction-operand. */
9454 916 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9455 916 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9456 916 : induc_def = PHI_RESULT (induction_phi);
9457 :
9458 : /* Create the iv update inside the loop. */
9459 916 : stmts = NULL;
9460 916 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9461 : induc_def, vec_step,
9462 : induction_type);
9463 :
9464 916 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9465 916 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9466 :
9467 : /* Set the arguments of the phi node: */
9468 916 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9469 916 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9470 : UNKNOWN_LOCATION);
9471 :
9472 916 : slp_node->push_vec_def (induction_phi);
9473 :
9474 : /* In case that vectorization factor (VF) is bigger than the number
9475 : of elements that we can fit in a vectype (nunits), we have to generate
9476 : more than one vector stmt - i.e - we need to "unroll" the
9477 : vector stmt by a factor VF/nunits. For more details see documentation
9478 : in vectorizable_operation. */
9479 :
9480 916 : if (ncopies > 1)
9481 : {
9482 286 : stmts = NULL;
9483 : /* FORNOW. This restriction should be relaxed. */
9484 286 : gcc_assert (!nested_in_vect_loop);
9485 :
 : /* The step between adjacent copies covers NUNITS (not VF) scalar
 : iterations. */
9486 286 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9487 : nunits, induction_type);
9488 :
9489 286 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9490 : new_name, vectype,
9491 : induction_type);
9492 286 : vec_def = induc_def;
9493 1046 : for (i = 1; i < ncopies; i++)
9494 : {
9495 : /* vec_i = vec_prev + vec_step. */
9496 474 : stmts = NULL;
9497 474 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9498 : vec_def, vec_step,
9499 : induction_type);
9500 474 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9501 474 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9502 474 : slp_node->push_vec_def (new_stmt);
9503 : }
9504 : }
9505 :
9506 916 : if (dump_enabled_p ())
9507 64 : dump_printf_loc (MSG_NOTE, vect_location,
9508 : "transform induction: created def-use cycle: %G%G",
9509 32 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9510 :
9511 : return true;
9512 : }
9513 :
9514 : /* Function vectorizable_induction
9515 :
9516 : Check if STMT_INFO performs an induction computation that can be vectorized.
9517 : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9518 : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9519 : Return true if STMT_INFO is vectorizable in this way. */
9520 :
 : /* Check if STMT_INFO (an induction PHI) can be vectorized for SLP_NODE
 :    in LOOP_VINFO.  When COST_VEC is non-NULL only analyze and record
 :    costs; otherwise generate the vector induction variables.  Linear
 :    (vect_step_op_add) inductions are handled here; nonlinear ones are
 :    dispatched to vectorizable_nonlinear_induction.  Returns true on
 :    success.  */
9521 : bool
9522 291711 : vectorizable_induction (loop_vec_info loop_vinfo,
9523 : stmt_vec_info stmt_info,
9524 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9525 : {
9526 291711 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9527 291711 : bool nested_in_vect_loop = false;
9528 291711 : class loop *iv_loop;
9529 291711 : tree vec_def;
9530 291711 : edge pe = loop_preheader_edge (loop);
9531 291711 : basic_block new_bb;
9532 291711 : tree vec_init = NULL_TREE, vec_step, t;
9533 291711 : tree new_name;
9534 291711 : gphi *induction_phi;
9535 291711 : tree induc_def, vec_dest;
9536 291711 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9537 291711 : unsigned i;
9538 291711 : tree expr;
9539 291711 : tree index_vectype = NULL_TREE;
9540 291711 : gimple_stmt_iterator si;
9541 291711 : enum vect_induction_op_type induction_type
9542 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9543 :
9544 319128 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9545 152904 : if (!phi)
9546 : return false;
9547 :
9548 152904 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9549 : return false;
9550 :
9551 : /* Make sure it was recognized as induction computation. */
9552 152904 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9553 : return false;
9554 :
9555 : /* Handle nonlinear induction in a separate place. */
9556 149271 : if (induction_type != vect_step_op_add)
9557 8010 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9558 8010 : slp_node, cost_vec);
9559 :
9560 141261 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9561 141261 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9562 :
9563 : /* FORNOW. These restrictions should be relaxed. */
9564 141261 : if (nested_in_vect_loop_p (loop, stmt_info))
9565 : {
9566 740 : imm_use_iterator imm_iter;
9567 740 : use_operand_p use_p;
9568 740 : gimple *exit_phi;
9569 740 : edge latch_e;
9570 740 : tree loop_arg;
9571 :
9572 740 : exit_phi = NULL;
9573 740 : latch_e = loop_latch_edge (loop->inner);
9574 740 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
 : /* Look for a non-debug use of the inner-loop latch value outside
 :    the inner loop (an exit PHI in the outer loop).  */
9575 2256 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9576 : {
9577 800 : gimple *use_stmt = USE_STMT (use_p);
9578 800 : if (is_gimple_debug (use_stmt))
9579 36 : continue;
9580 :
9581 764 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9582 : {
9583 : exit_phi = use_stmt;
9584 : break;
9585 : }
9586 740 : }
9587 740 : if (exit_phi)
9588 : {
9589 24 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9590 24 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9591 8 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9592 : {
9593 16 : if (dump_enabled_p ())
9594 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9595 : "inner-loop induction only used outside "
9596 : "of the outer vectorized loop.\n");
9597 16 : return false;
9598 : }
9599 : }
9600 :
9601 724 : nested_in_vect_loop = true;
9602 724 : iv_loop = loop->inner;
9603 : }
9604 : else
9605 : iv_loop = loop;
9606 141245 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9607 :
9608 141245 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
9609 : {
9610 : /* The current SLP code creates the step value element-by-element. */
9611 : if (dump_enabled_p ())
9612 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9613 : "SLP induction not supported for variable-length"
9614 : " vectors.\n");
9615 : return false;
9616 : }
9617 :
9618 141245 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9619 : {
9620 12 : if (dump_enabled_p ())
9621 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9622 : "floating point induction vectorization disabled\n");
9623 12 : return false;
9624 : }
9625 :
9626 141233 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9627 141233 : gcc_assert (step_expr != NULL_TREE);
9628 282420 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
9629 282324 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
9630 : {
9631 12 : if (dump_enabled_p ())
9632 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9633 : "bit-precision induction vectorization not "
9634 : "supported.\n");
9635 12 : return false;
9636 : }
9637 141221 : tree stept = TREE_TYPE (step_expr);
9638 141221 : tree step_vectype = get_same_sized_vectype (stept, vectype);
9639 141221 : stept = TREE_TYPE (step_vectype);
9640 :
9641 : /* Check for target support of the vectorized arithmetic used here. */
9642 141221 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
9643 141221 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
9644 23744 : return false;
9645 117477 : if (!nunits.is_constant ())
9646 : {
9647 : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
9648 : return false;
9649 : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
9650 : if (SCALAR_FLOAT_TYPE_P (stept))
9651 : {
9652 : tree index_type = build_nonstandard_integer_type
9653 : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
9654 :
9655 : index_vectype = build_vector_type (index_type, nunits);
9656 : if (!can_float_p (TYPE_MODE (step_vectype),
9657 : TYPE_MODE (index_vectype), 1))
9658 : return false;
9659 : }
9660 : }
9661 :
9662 117477 : unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
9663 117477 : if (cost_vec) /* transformation not required. */
9664 : {
9665 306654 : unsigned inside_cost = 0, prologue_cost = 0;
9666 : /* We eventually need to set a vector type on invariant
9667 : arguments. */
9668 : unsigned j;
9669 : slp_tree child;
9670 306654 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9671 204436 : if (!vect_maybe_update_slp_op_vectype
9672 204436 : (child, SLP_TREE_VECTYPE (slp_node)))
9673 : {
9674 0 : if (dump_enabled_p ())
9675 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9676 : "incompatible vector types for "
9677 : "invariants\n");
9678 0 : return false;
9679 : }
9680 : /* loop cost for vec_loop. */
9681 102218 : inside_cost = record_stmt_cost (cost_vec, nvects,
9682 : vector_stmt, slp_node, 0, vect_body);
9683 : /* prologue cost for vec_init (if not nested) and step. */
9684 102218 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9685 : scalar_to_vec,
9686 : slp_node, 0, vect_prologue);
9687 102218 : if (dump_enabled_p ())
9688 4025 : dump_printf_loc (MSG_NOTE, vect_location,
9689 : "vect_model_induction_cost: inside_cost = %d, "
9690 : "prologue_cost = %d .\n", inside_cost,
9691 : prologue_cost);
9692 :
9693 102218 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9694 102218 : DUMP_VECT_SCOPE ("vectorizable_induction");
9695 102218 : return true;
9696 : }
9697 :
9698 : /* Transform. */
9699 :
9700 : /* Compute a vector variable, initialized with the first VF values of
9701 : the induction variable. E.g., for an iv with IV_PHI='X' and
9702 : evolution S, for a vector of 4 units, we want to compute:
9703 : [X, X + S, X + 2*S, X + 3*S]. */
9704 :
9705 15259 : if (dump_enabled_p ())
9706 2770 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9707 :
9708 15259 : pe = loop_preheader_edge (iv_loop);
9709 : /* Find the first insertion point in the BB. */
9710 15259 : basic_block bb = gimple_bb (phi);
9711 15259 : si = gsi_after_labels (bb);
9712 :
9713 : /* For SLP induction we have to generate several IVs as for example
9714 : with group size 3 we need
9715 : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9716 : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9717 15259 : gimple_stmt_iterator incr_si;
9718 15259 : bool insert_after;
9719 15259 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
9720 :
9721 : /* The initial values are vectorized, but any lanes > group_size
9722 : need adjustment. */
9723 15259 : slp_tree init_node
9724 15259 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9725 :
9726 : /* Gather steps. Since we do not vectorize inductions as
9727 : cycles we have to reconstruct the step from SCEV data. */
9728 15259 : unsigned group_size = SLP_TREE_LANES (slp_node);
9729 15259 : tree *steps = XALLOCAVEC (tree, group_size);
9730 15259 : tree *inits = XALLOCAVEC (tree, group_size);
9731 15259 : stmt_vec_info phi_info;
9732 47011 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9733 : {
9734 16493 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9735 16493 : if (!init_node)
9736 16254 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9737 : pe->dest_idx);
9738 : }
9739 :
9740 : /* Now generate the IVs. */
9741 30518 : gcc_assert (multiple_p (nunits * nvects, group_size));
9742 15259 : unsigned nivs;
9743 15259 : unsigned HOST_WIDE_INT const_nunits;
9744 15259 : if (nested_in_vect_loop)
9745 : nivs = nvects;
9746 15041 : else if (nunits.is_constant (&const_nunits))
9747 : {
9748 : /* Compute the number of distinct IVs we need. First reduce
9749 : group_size if it is a multiple of const_nunits so we get
9750 : one IV for a group_size of 4 but const_nunits 2. */
9751 15041 : unsigned group_sizep = group_size;
9752 15041 : if (group_sizep % const_nunits == 0)
9753 111 : group_sizep = group_sizep / const_nunits;
9754 15041 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
9755 : }
9756 : else
9757 : {
9758 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9759 : nivs = 1;
9760 : }
9761 15259 : gimple_seq init_stmts = NULL;
9762 15259 : tree lupdate_mul = NULL_TREE;
 : /* Precompute LUPDATE_MUL, the multiple of the step covered by one
 :    vector iteration, used below for the in-loop IV update.  */
9763 218 : if (!nested_in_vect_loop)
9764 : {
9765 15041 : if (nunits.is_constant (&const_nunits))
9766 : {
9767 : /* The number of iterations covered in one vector iteration. */
9768 15041 : unsigned lup_mul = (nvects * const_nunits) / group_size;
9769 15041 : lupdate_mul
9770 15041 : = build_vector_from_val (step_vectype,
9771 15041 : SCALAR_FLOAT_TYPE_P (stept)
9772 28 : ? build_real_from_wide (stept, lup_mul,
9773 : UNSIGNED)
9774 30054 : : build_int_cstu (stept, lup_mul));
9775 : }
9776 : else
9777 : {
9778 : if (SCALAR_FLOAT_TYPE_P (stept))
9779 : {
9780 : tree tem = build_int_cst (integer_type_node, vf);
9781 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9782 : }
9783 : else
9784 : lupdate_mul = build_int_cst (stept, vf);
9785 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9786 : lupdate_mul);
9787 : }
9788 : }
 : /* Account for iterations skipped at the start when masking is used
 :    (LOOP_VINFO_MASK_SKIP_NITERS); PEEL_MUL is subtracted from
 :    STEP_MUL further below.  */
9789 15259 : tree peel_mul = NULL_TREE;
9790 15259 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9791 : {
9792 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9793 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9794 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9795 : else
9796 0 : peel_mul = gimple_convert (&init_stmts, stept,
9797 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9798 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
9799 : step_vectype, peel_mul);
9800 : }
9801 15259 : tree step_mul = NULL_TREE;
9802 15259 : unsigned ivn;
9803 15259 : auto_vec<tree> vec_steps;
9804 31094 : for (ivn = 0; ivn < nivs; ++ivn)
9805 : {
9806 15835 : gimple_seq stmts = NULL;
 : /* True iff the init/step computation is loop invariant; only
 :    asserted before the VEC_INIT adjustment, never cleared on the
 :    paths visible here.  */
9807 15835 : bool invariant = true;
9808 15835 : if (nunits.is_constant (&const_nunits))
9809 : {
9810 15835 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9811 15835 : tree_vector_builder init_elts (vectype, const_nunits, 1);
9812 15835 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9813 103447 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9814 : {
9815 : /* The scalar steps of the IVs. */
9816 87612 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9817 87612 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9818 87612 : step_elts.quick_push (elt);
9819 87612 : if (!init_node)
9820 : {
9821 : /* The scalar inits of the IVs if not vectorized. */
9822 86362 : elt = inits[(ivn*const_nunits + eltn) % group_size];
9823 86362 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
9824 86362 : TREE_TYPE (elt)))
9825 264 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9826 264 : TREE_TYPE (vectype), elt);
9827 86362 : init_elts.quick_push (elt);
9828 : }
9829 : /* The number of steps to add to the initial values. */
9830 87612 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9831 175224 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9832 175122 : ? build_real_from_wide (stept, mul_elt,
9833 : UNSIGNED)
9834 175122 : : build_int_cstu (stept, mul_elt));
9835 : }
9836 15835 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
9837 15835 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9838 15835 : if (!init_node)
9839 15583 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
9840 15835 : }
9841 : else
9842 : {
9843 : tree step = gimple_convert (&init_stmts, stept, steps[0]);
9844 : if (init_node)
9845 : ;
9846 : else if (INTEGRAL_TYPE_P (stept))
9847 : {
9848 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9849 : /* Build the initial value directly as a VEC_SERIES_EXPR. */
9850 : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
9851 : step_vectype, new_name, step);
9852 : if (!useless_type_conversion_p (vectype, step_vectype))
9853 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9854 : vectype, vec_init);
9855 : }
9856 : else
9857 : {
9858 : /* Build:
9859 : [base, base, base, ...]
9860 : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9861 : gcc_assert (SCALAR_FLOAT_TYPE_P (stept));
9862 : gcc_assert (flag_associative_math);
9863 : gcc_assert (index_vectype != NULL_TREE);
9864 :
9865 : tree index = build_index_vector (index_vectype, 0, 1);
9866 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9867 : tree base_vec = gimple_build_vector_from_val (&init_stmts,
9868 : step_vectype,
9869 : new_name);
9870 : tree step_vec = gimple_build_vector_from_val (&init_stmts,
9871 : step_vectype,
9872 : step);
9873 : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
9874 : step_vectype, index);
9875 : vec_init = gimple_build (&init_stmts, MULT_EXPR,
9876 : step_vectype, vec_init, step_vec);
9877 : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
9878 : step_vectype, vec_init, base_vec);
9879 : if (!useless_type_conversion_p (vectype, step_vectype))
9880 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9881 : vectype, vec_init);
9882 : }
9883 : /* iv_loop is nested in the loop to be vectorized. Generate:
9884 : vec_step = [S, S, S, S] */
9885 : t = unshare_expr (step);
9886 : gcc_assert (CONSTANT_CLASS_P (t)
9887 : || TREE_CODE (t) == SSA_NAME);
9888 : vec_step = gimple_build_vector_from_val (&init_stmts,
9889 : step_vectype, t);
9890 : }
9891 15835 : vec_steps.safe_push (vec_step);
9892 15835 : if (peel_mul)
9893 : {
9894 0 : if (!step_mul)
9895 : {
9896 0 : gcc_assert (!nunits.is_constant ());
9897 : step_mul = gimple_build (&init_stmts,
9898 : MINUS_EXPR, step_vectype,
9899 : build_zero_cst (step_vectype), peel_mul);
9900 : }
9901 : else
9902 0 : step_mul = gimple_build (&init_stmts,
9903 : MINUS_EXPR, step_vectype,
9904 : step_mul, peel_mul);
9905 : }
9906 :
9907 : /* Create the induction-phi that defines the induction-operand. */
9908 15835 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9909 : "vec_iv_");
9910 15835 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9911 15835 : induc_def = PHI_RESULT (induction_phi);
9912 :
9913 : /* Create the iv update inside the loop */
9914 15835 : tree up = vec_step;
9915 15835 : if (lupdate_mul)
9916 : {
9917 15583 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
9918 : {
9919 : /* When we're using loop_len produced by SELECT_VL, the
9920 : non-final iterations are not always processing VF
9921 : elements. So vectorize induction variable instead of
9922 :
9923 : _21 = vect_vec_iv_.6_22 + { VF, ... };
9924 :
9925 : We should generate:
9926 :
9927 : _35 = .SELECT_VL (ivtmp_33, VF);
9928 : vect_cst__22 = [vec_duplicate_expr] _35;
9929 : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
9930 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
9931 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
9932 : vectype, 0, 0, false);
9933 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9934 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
9935 : else
9936 0 : expr = gimple_convert (&stmts, stept, len);
9937 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
9938 : expr);
9939 0 : up = gimple_build (&stmts, MULT_EXPR,
9940 : step_vectype, vec_step, lupdate_mul);
9941 : }
9942 : else
9943 15583 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9944 : vec_step, lupdate_mul);
9945 : }
9946 15835 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9947 15835 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
9948 15835 : vec_def = gimple_convert (&stmts, vectype, vec_def);
9949 15835 : insert_iv_increment (&incr_si, insert_after, stmts);
9950 15835 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9951 : UNKNOWN_LOCATION);
9952 :
9953 15835 : if (init_node)
9954 252 : vec_init = vect_get_slp_vect_def (init_node, ivn);
9955 15835 : if (!nested_in_vect_loop
9956 15835 : && step_mul
9957 15835 : && !integer_zerop (step_mul))
9958 : {
9959 15142 : gcc_assert (invariant);
9960 15142 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9961 15142 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9962 : vec_step, step_mul);
9963 15142 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9964 : vec_def, up);
9965 15142 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9966 : }
9967 :
9968 : /* Set the arguments of the phi node: */
9969 15835 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9970 :
9971 15835 : slp_node->push_vec_def (induction_phi);
9972 : }
9973 15259 : if (!nested_in_vect_loop)
9974 : {
9975 : /* Fill up to the number of vectors we need for the whole group. */
9976 15041 : if (nunits.is_constant (&const_nunits))
9977 15041 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
9978 : else
9979 : nivs = 1;
9980 15041 : vec_steps.reserve (nivs-ivn);
9981 30103 : for (; ivn < nivs; ++ivn)
9982 : {
9983 21 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9984 21 : vec_steps.quick_push (vec_steps[0]);
9985 : }
9986 : }
9987 :
9988 : /* Re-use IVs when we can. We are generating further vector
9989 : stmts by adding VF' * stride to the IVs generated above. */
9990 15259 : if (ivn < nvects)
9991 : {
9992 3391 : if (nunits.is_constant (&const_nunits))
9993 : {
9994 3391 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
9995 3391 : / group_size);
9996 3391 : lupdate_mul
9997 3391 : = build_vector_from_val (step_vectype,
9998 3391 : SCALAR_FLOAT_TYPE_P (stept)
9999 8 : ? build_real_from_wide (stept,
10000 8 : vfp, UNSIGNED)
10001 6774 : : build_int_cstu (stept, vfp));
10002 : }
10003 : else
10004 : {
10005 : if (SCALAR_FLOAT_TYPE_P (stept))
10006 : {
10007 : tree tem = build_int_cst (integer_type_node, nunits);
10008 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
10009 : }
10010 : else
10011 : lupdate_mul = build_int_cst (stept, nunits);
10012 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
10013 : lupdate_mul);
10014 : }
10015 10902 : for (; ivn < nvects; ++ivn)
10016 : {
10017 7511 : gimple *iv
10018 7511 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10019 7511 : tree def = gimple_get_lhs (iv);
10020 7511 : if (ivn < 2*nivs)
10021 3483 : vec_steps[ivn - nivs]
10022 3483 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10023 3483 : vec_steps[ivn - nivs], lupdate_mul);
10024 7511 : gimple_seq stmts = NULL;
10025 7511 : def = gimple_convert (&stmts, step_vectype, def);
10026 22533 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10027 7511 : def, vec_steps[ivn % nivs]);
10028 7511 : def = gimple_convert (&stmts, vectype, def);
10029 7511 : if (gimple_code (iv) == GIMPLE_PHI)
10030 3483 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10031 : else
10032 : {
10033 4028 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10034 4028 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10035 : }
10036 7511 : slp_node->push_vec_def (def);
10037 : }
10038 : }
10039 :
 : /* Emit the accumulated init statements on the preheader edge; the
 :    edge must not need splitting.  */
10040 15259 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10041 15259 : gcc_assert (!new_bb);
10042 :
10043 15259 : return true;
10044 15259 : }
10045 :
10046 : /* Function vectorizable_live_operation_1.
10047 :
10048 : helper function for vectorizable_live_operation. */
10049 :
 : /* Extract the final scalar value of type LHS_TYPE from lane BITSTART
 :    (in bits, of width BITSIZE) of the vectorized live value VEC_LHS,
 :    materializing the extraction in EXIT_BB and returning the new scalar
 :    SSA name.  *EXIT_GSI is set to the insertion point used.  */
 : static tree
10051 2837 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
10052 : tree vectype, slp_tree slp_node,
10053 : tree bitsize, tree bitstart, tree vec_lhs,
10054 : tree lhs_type, gimple_stmt_iterator *exit_gsi)
10055 : {
10056 2837 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10057 :
 : /* Create a loop-closed PHI for the vector value in EXIT_BB.  With
 :    early breaks the exit block can have multiple predecessors; all
 :    of them carry VEC_LHS.  */
10058 2837 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10059 2837 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10060 5676 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10061 2839 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
10062 :
10063 2837 : gimple_seq stmts = NULL;
10064 2837 : tree new_tree;
10065 :
10066 : /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10067 2837 : if (integer_zerop (bitstart))
10068 : {
10069 213 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10070 : vec_lhs_phi, bitsize, bitstart);
10071 :
10072 : /* Convert the extracted vector element to the scalar type. */
10073 213 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10074 : }
10075 2624 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10076 : {
10077 : /* Emit:
10078 :
10079 : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>
10080 :
10081 : where VEC_LHS is the vectorized live-out result, LEN is the length of
10082 : the vector, BIAS is the load-store bias. The bias should not be used
10083 : at all since we are not using load/store operations, but LEN will be
10084 : REALLEN + BIAS, so subtract it to get to the correct position. */
10085 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10086 0 : gimple_seq tem = NULL;
10087 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10088 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
10089 : &LOOP_VINFO_LENS (loop_vinfo),
10090 : 1, vectype, 0, 1, false);
10091 0 : gimple_seq_add_seq (&stmts, tem);
10092 :
10093 : /* LAST_INDEX = LEN - 1. */
10094 0 : tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
10095 0 : len, build_one_cst (TREE_TYPE (len)));
10096 :
10097 : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>. */
10098 0 : tree scalar_res
10099 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10100 : vec_lhs_phi, last_index);
10101 :
10102 : /* Convert the extracted vector element to the scalar type. */
10103 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10104 : }
10105 2624 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10106 : {
10107 : /* Emit:
10108 :
10109 : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10110 :
10111 : where VEC_LHS is the vectorized live-out result and MASK is
10112 : the loop mask for the final iteration. */
10113 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10114 0 : tree scalar_type = TREE_TYPE (vectype);
10115 0 : gimple_seq tem = NULL;
10116 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10117 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10118 : &LOOP_VINFO_MASKS (loop_vinfo),
10119 : 1, vectype, 0);
10120 0 : tree scalar_res;
10121 0 : gimple_seq_add_seq (&stmts, tem);
10122 :
10123 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10124 : mask, vec_lhs_phi);
10125 :
10126 : /* Convert the extracted vector element to the scalar type. */
10127 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10128 : }
10129 : else
10130 : {
 : /* Otherwise extract the requested lane with a BIT_FIELD_REF at
 :    the constant bit position BITSTART.  */
10131 2624 : tree bftype = TREE_TYPE (vectype);
10132 2624 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10133 85 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10134 2624 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10135 2624 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10136 : &stmts, true, NULL_TREE);
10137 : }
10138 :
 : /* Insert the extraction code right after the labels of EXIT_BB.  */
10139 2837 : *exit_gsi = gsi_after_labels (exit_bb);
10140 2837 : if (stmts)
10141 2837 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10142 :
10143 2837 : return new_tree;
10144 : }
10145 :
10146 : /* Function vectorizable_live_operation.
10147 :
10148 : STMT_INFO computes a value that is used outside the loop. Check if
10149 : it can be supported. */
10150 :
10151 : bool
10152 263875 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10153 : slp_tree slp_node, slp_instance slp_node_instance,
10154 : int slp_index, bool vec_stmt_p,
10155 : stmt_vector_for_cost *cost_vec)
10156 : {
10157 263875 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10158 263875 : imm_use_iterator imm_iter;
10159 263875 : tree lhs, lhs_type, bitsize;
10160 263875 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10161 263875 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10162 263875 : gimple *use_stmt;
10163 263875 : use_operand_p use_p;
10164 263875 : auto_vec<tree> vec_oprnds;
10165 263875 : int vec_entry = 0;
10166 263875 : poly_uint64 vec_index = 0;
10167 :
10168 263875 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10169 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10170 :
10171 : /* If a stmt of a reduction is live, vectorize it via
10172 : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10173 : validity so just trigger the transform here. */
10174 263875 : if (vect_is_reduction (slp_node))
10175 : {
10176 57810 : if (!vec_stmt_p)
10177 : return true;
10178 : /* For SLP reductions we vectorize the epilogue for all involved stmts
10179 : together. For SLP reduction chains we only get here once. */
10180 23481 : if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
10181 23222 : && slp_index != 0)
10182 : return true;
10183 23022 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
10184 23022 : if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10185 23022 : || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10186 : return true;
10187 :
10188 22179 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10189 22179 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10190 22170 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10191 : slp_node_instance,
10192 : LOOP_VINFO_MAIN_EXIT (loop_vinfo));
10193 :
10194 : /* If early break we only have to materialize the reduction on the merge
10195 : block, but we have to find an alternate exit first. */
10196 22179 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10197 : {
10198 28 : slp_tree phis_node = slp_node_instance->reduc_phis;
10199 28 : stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
10200 89 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10201 28 : if (exit != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
10202 : {
10203 23 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10204 : phis_node, slp_node_instance,
10205 : exit);
10206 23 : break;
10207 28 : }
10208 28 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10209 9 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10210 : phis_node, slp_node_instance,
10211 : LOOP_VINFO_MAIN_EXIT
10212 : (loop_vinfo));
10213 : }
10214 :
10215 22179 : return true;
10216 : }
10217 :
10218 : /* If STMT is not relevant and it is a simple assignment and its inputs are
10219 : invariant then it can remain in place, unvectorized. The original last
10220 : scalar value that it computes will be used. */
10221 206065 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10222 : {
10223 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10224 0 : if (dump_enabled_p ())
10225 0 : dump_printf_loc (MSG_NOTE, vect_location,
10226 : "statement is simple and uses invariant. Leaving in "
10227 : "place.\n");
10228 0 : return true;
10229 : }
10230 :
10231 206065 : gcc_assert (slp_index >= 0);
10232 :
10233 : /* Get the last occurrence of the scalar index from the concatenation of
10234 : all the slp vectors. Calculate which slp vector it is and the index
10235 : within. */
10236 206065 : int num_scalar = SLP_TREE_LANES (slp_node);
10237 206065 : int num_vec = vect_get_num_copies (vinfo, slp_node);
10238 206065 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10239 :
10240 : /* Calculate which vector contains the result, and which lane of
10241 : that vector we need. */
10242 206065 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10243 : {
10244 : if (dump_enabled_p ())
10245 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10246 : "Cannot determine which vector holds the"
10247 : " final result.\n");
10248 : return false;
10249 : }
10250 :
10251 206065 : if (!vec_stmt_p)
10252 : {
10253 : /* No transformation required. */
10254 164016 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10255 : {
10256 27066 : if (SLP_TREE_LANES (slp_node) != 1)
10257 : {
10258 19 : if (dump_enabled_p ())
10259 19 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10260 : "can't operate on partial vectors "
10261 : "because an SLP statement is live after "
10262 : "the loop.\n");
10263 19 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10264 : }
10265 27047 : else if (num_vec > 1)
10266 : {
10267 15398 : if (dump_enabled_p ())
10268 51 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10269 : "can't operate on partial vectors "
10270 : "because ncopies is greater than 1.\n");
10271 15398 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10272 : }
10273 : else
10274 : {
10275 11649 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10276 : OPTIMIZE_FOR_SPEED))
10277 0 : vect_record_loop_mask (loop_vinfo,
10278 : &LOOP_VINFO_MASKS (loop_vinfo),
10279 : 1, vectype, NULL);
10280 11649 : else if (can_vec_extract_var_idx_p (
10281 11649 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10282 0 : vect_record_loop_len (loop_vinfo,
10283 : &LOOP_VINFO_LENS (loop_vinfo),
10284 : 1, vectype, 1);
10285 : else
10286 : {
10287 11649 : if (dump_enabled_p ())
10288 651 : dump_printf_loc (
10289 651 : MSG_MISSED_OPTIMIZATION, vect_location,
10290 : "can't operate on partial vectors "
10291 : "because the target doesn't support extract "
10292 : "last reduction.\n");
10293 11649 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10294 : }
10295 : }
10296 : }
10297 : /* ??? Enable for loop costing as well. */
10298 27066 : if (!loop_vinfo)
10299 93664 : record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
10300 : 0, vect_epilogue);
10301 164016 : return true;
10302 : }
10303 :
10304 : /* Use the lhs of the original scalar statement. */
10305 42049 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10306 42049 : if (dump_enabled_p ())
10307 1034 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10308 : "stmt %G", stmt);
10309 :
10310 42049 : lhs = gimple_get_lhs (stmt);
10311 42049 : lhs_type = TREE_TYPE (lhs);
10312 :
10313 42049 : bitsize = vector_element_bits_tree (vectype);
10314 :
10315 : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10316 42049 : gcc_assert (!loop_vinfo
10317 : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10318 : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10319 : || SLP_TREE_LANES (slp_node) == 1));
10320 :
10321 : /* Get the correct slp vectorized stmt. */
10322 42049 : tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10323 42049 : gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10324 :
10325 : /* In case we need to early break vectorize also get the first stmt. */
10326 42049 : tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10327 :
10328 : /* Get entry to use. */
10329 42049 : tree bitstart = bitsize_int (vec_index);
10330 42049 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10331 :
10332 42049 : if (loop_vinfo)
10333 : {
10334 : /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10335 : requirement, insert one phi node for it. It looks like:
10336 : loop;
10337 : BB:
10338 : # lhs' = PHI <lhs>
10339 : ==>
10340 : loop;
10341 : BB:
10342 : # vec_lhs' = PHI <vec_lhs>
10343 : new_tree = lane_extract <vec_lhs', ...>;
10344 : lhs' = new_tree; */
10345 :
10346 2896 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10347 : /* Check if we have a loop where the chosen exit is not the main exit,
10348 : in these cases for an early break we restart the iteration the vector code
10349 : did. For the live values we want the value at the start of the iteration
10350 : rather than at the end. */
10351 2896 : edge main_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
10352 2896 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10353 15031 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10354 9239 : if (!is_gimple_debug (use_stmt)
10355 9239 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10356 2837 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10357 : {
10358 2837 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10359 2837 : phi_arg_index_from_use (use_p));
10360 2837 : gcc_assert (loop_exit_edge_p (loop, e));
10361 2837 : bool main_exit_edge = e == main_e;
10362 2837 : tree tmp_vec_lhs = vec_lhs;
10363 2837 : tree tmp_bitstart = bitstart;
10364 :
10365 : /* For early exit where the exit is not in the BB that leads
10366 : to the latch then we're restarting the iteration in the
10367 : scalar loop. So get the first live value. */
10368 2837 : bool early_break_first_element_p
10369 2837 : = all_exits_as_early_p || !main_exit_edge;
10370 2837 : if (early_break_first_element_p)
10371 : {
10372 195 : tmp_vec_lhs = vec_lhs0;
10373 195 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10374 : }
10375 :
10376 2837 : gimple_stmt_iterator exit_gsi;
10377 2837 : tree new_tree
10378 2837 : = vectorizable_live_operation_1 (loop_vinfo,
10379 : e->dest, vectype,
10380 : slp_node, bitsize,
10381 : tmp_bitstart, tmp_vec_lhs,
10382 : lhs_type, &exit_gsi);
10383 :
10384 2837 : auto gsi = gsi_for_stmt (use_stmt);
10385 2837 : tree lhs_phi = gimple_phi_result (use_stmt);
10386 2837 : remove_phi_node (&gsi, false);
10387 2837 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10388 2837 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10389 2837 : break;
10390 2896 : }
10391 :
10392 : /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10393 12194 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10394 6402 : gcc_assert (is_gimple_debug (use_stmt)
10395 2896 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10396 : }
10397 : else
10398 : {
10399 : /* For basic-block vectorization simply insert the lane-extraction. */
10400 39153 : tree bftype = TREE_TYPE (vectype);
10401 39153 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10402 0 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10403 39153 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10404 : vec_lhs, bitsize, bitstart);
10405 39153 : gimple_seq stmts = NULL;
10406 39153 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10407 : &stmts, true, NULL_TREE);
10408 39153 : if (TREE_CODE (new_tree) == SSA_NAME
10409 78306 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10410 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10411 39153 : if (is_a <gphi *> (vec_stmt))
10412 : {
10413 2586 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10414 2586 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10415 : }
10416 : else
10417 : {
10418 36567 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10419 36567 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10420 : }
10421 :
10422 : /* Replace use of lhs with newly computed result. If the use stmt is a
10423 : single arg PHI, just replace all uses of PHI result. It's necessary
10424 : because lcssa PHI defining lhs may be before newly inserted stmt. */
10425 39153 : use_operand_p use_p;
10426 39153 : stmt_vec_info use_stmt_info;
10427 240818 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10428 162512 : if (!is_gimple_debug (use_stmt)
10429 162512 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10430 116517 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10431 : {
10432 : /* ??? This can happen when the live lane ends up being
10433 : rooted in a vector construction code-generated by an
10434 : external SLP node (and code-generation for that already
10435 : happened). See gcc.dg/vect/bb-slp-47.c.
10436 : Doing this is what would happen if that vector CTOR
10437 : were not code-generated yet so it is not too bad.
10438 : ??? In fact we'd likely want to avoid this situation
10439 : in the first place. */
10440 65965 : if (TREE_CODE (new_tree) == SSA_NAME
10441 65603 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10442 65603 : && gimple_code (use_stmt) != GIMPLE_PHI
10443 124775 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10444 : use_stmt))
10445 : {
10446 362 : if (dump_enabled_p ())
10447 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10448 : "Using original scalar computation for "
10449 : "live lane because use preceeds vector "
10450 : "def\n");
10451 362 : continue;
10452 : }
10453 200123 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10454 : {
10455 : /* ??? It can also happen that we end up pulling a def into
10456 : a loop where replacing out-of-loop uses would require
10457 : a new LC SSA PHI node. Retain the original scalar in
10458 : those cases as well. PR98064. */
10459 67441 : edge e;
10460 67441 : if (TREE_CODE (new_tree) == SSA_NAME
10461 67441 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10462 67441 : && (gimple_bb (use_stmt)->loop_father
10463 67441 : != gimple_bb (vec_stmt)->loop_father)
10464 : /* But a replacement in a LC PHI is OK. This happens
10465 : in gcc.dg/vect/bb-slp-57.c for example. */
10466 7536 : && (gimple_code (use_stmt) != GIMPLE_PHI
10467 2900 : || (((e = phi_arg_edge_from_use (use_p)), true)
10468 2900 : && !loop_exit_edge_p
10469 2900 : (gimple_bb (vec_stmt)->loop_father, e)))
10470 73091 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10471 5650 : gimple_bb (use_stmt)->loop_father))
10472 : {
10473 0 : if (dump_enabled_p ())
10474 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10475 : "Using original scalar computation for "
10476 : "live lane because there is an "
10477 : "out-of-loop definition for it\n");
10478 0 : continue;
10479 : }
10480 67441 : SET_USE (use_p, new_tree);
10481 : }
10482 65241 : update_stmt (use_stmt);
10483 39153 : }
10484 : }
10485 :
10486 : return true;
10487 263875 : }
10488 :
10489 : /* Given loop represented by LOOP_VINFO, return true if computation of
10490 : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10491 : otherwise. */
10492 :
10493 : static bool
10494 61529 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
10495 : {
10496 61529 : gcc_assert (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo));
10497 :
10498 : /* Constant case. */
10499 61529 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10500 : {
10501 35844 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10502 35844 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10503 :
10504 35844 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10505 35844 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10506 35844 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10507 : return true;
10508 : }
10509 :
10510 25685 : widest_int max;
10511 25685 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10512 : /* Check the upper bound of loop niters. */
10513 25685 : if (get_max_loop_iterations (loop, &max))
10514 : {
10515 25685 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10516 25685 : signop sgn = TYPE_SIGN (type);
10517 25685 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10518 25685 : if (max < type_max)
10519 25460 : return true;
10520 25685 : }
10521 : return false;
10522 25685 : }
10523 :
10524 : /* Return a mask type with half the number of elements as OLD_TYPE,
10525 : given that it should have mode NEW_MODE. */
10526 :
10527 : tree
10528 3714 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10529 : {
10530 3714 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10531 3714 : return build_truth_vector_type_for_mode (nunits, new_mode);
10532 : }
10533 :
10534 : /* Return a mask type with twice as many elements as OLD_TYPE,
10535 : given that it should have mode NEW_MODE. */
10536 :
10537 : tree
10538 5915 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10539 : {
10540 5915 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10541 5915 : return build_truth_vector_type_for_mode (nunits, new_mode);
10542 : }
10543 :
10544 : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10545 : contain a sequence of NVECTORS masks that each control a vector of type
10546 : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10547 : these vector masks with the vector version of SCALAR_MASK. */
10548 :
10549 : void
10550 77680 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10551 : unsigned int nvectors, tree vectype, tree scalar_mask)
10552 : {
10553 77680 : gcc_assert (nvectors != 0);
10554 :
10555 77680 : if (scalar_mask)
10556 : {
10557 3638 : scalar_cond_masked_key cond (scalar_mask, nvectors);
10558 3638 : loop_vinfo->scalar_cond_masked_set.add (cond);
10559 : }
10560 :
10561 77680 : masks->mask_set.add (std::make_pair (vectype, nvectors));
10562 77680 : }
10563 :
10564 : /* Given a complete set of masks MASKS, extract mask number INDEX
10565 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10566 : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10567 :
10568 : See the comment above vec_loop_masks for more details about the mask
10569 : arrangement. */
10570 :
tree
vect_get_loop_mask (loop_vec_info loop_vinfo,
		    gimple_stmt_iterator *gsi, vec_loop_masks *masks,
		    unsigned int nvectors, tree vectype, unsigned int index)
{
  if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
      == vect_partial_vectors_while_ult)
    {
      /* WHILE_ULT style: the rgroup is indexed by the number of vectors
	 it controls.  */
      rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
      tree mask_type = rgm->type;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it. */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (nvectors, true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
	      /* Provide a dummy definition until the real one is available. */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}

      tree mask = rgm->controls[index];
      if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
		    TYPE_VECTOR_SUBPARTS (vectype)))
	{
	  /* A loop mask for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  In this case each sequence
	     of N elements in the loop mask will be all-zero or all-one.
	     We can then view-convert the mask so that each sequence of
	     N elements is replaced by a single element.  */
	  gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
				  TYPE_VECTOR_SUBPARTS (vectype)));
	  gimple_seq seq = NULL;
	  mask_type = truth_type_for (vectype);
	  mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
      return mask;
    }
  else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	   == vect_partial_vectors_avx512)
    {
      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* AVX512 style: the rgroup is indexed by the number of scalars
	 per iteration rather than by NVECTORS.  */
      rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];

      /* The stored nV is dependent on the mask type produced.  */
      gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
			     TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
		  == rgm->factor);
      nvectors = rgm->factor;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it. */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (nvectors, true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
	      /* Provide a dummy definition until the real one is available. */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}
      /* If the stored mask already has the requested lane count, hand it
	 out directly.  */
      if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
		    TYPE_VECTOR_SUBPARTS (vectype)))
	return rgm->controls[index];

      /* Split the vector if needed.  Since we are dealing with integer mode
	 masks with AVX512 we can operate on the integer representation
	 performing the whole vector shifting.  */
      unsigned HOST_WIDE_INT factor;
      bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
				     TYPE_VECTOR_SUBPARTS (vectype), &factor);
      gcc_assert (ok);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
      tree mask_type = truth_type_for (vectype);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
      /* INDEX selects sub-part VPART of stored control VI.  */
      unsigned vi = index / factor;
      unsigned vpart = index % factor;
      tree vec = rgm->controls[vi];
      gimple_seq seq = NULL;
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
			  lang_hooks.types.type_for_mode
			    (TYPE_MODE (rgm->type), 1), vec);
      /* For integer mode masks simply shift the right bits into position.  */
      if (vpart != 0)
	vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
			    build_int_cst (integer_type_node,
					   (TYPE_VECTOR_SUBPARTS (vectype)
					    * vpart)));
      /* Truncate to the narrow integer mode, then view it as a mask.  */
      vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
				    (TYPE_MODE (mask_type), 1), vec);
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
      if (seq)
	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
      return vec;
    }
  else
    gcc_unreachable ();
}
10683 :
10684 : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10685 : lengths for controlling an operation on VECTYPE. The operation splits
10686 : each element of VECTYPE into FACTOR separate subelements, measuring the
10687 : length as a number of these subelements. */
10688 :
10689 : void
10690 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10691 : unsigned int nvectors, tree vectype, unsigned int factor)
10692 : {
10693 0 : gcc_assert (nvectors != 0);
10694 0 : if (lens->length () < nvectors)
10695 0 : lens->safe_grow_cleared (nvectors, true);
10696 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10697 :
10698 : /* The number of scalars per iteration, scalar occupied bytes and
10699 : the number of vectors are both compile-time constants. */
10700 0 : unsigned int nscalars_per_iter
10701 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10702 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10703 :
10704 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10705 : {
10706 : /* For now, we only support cases in which all loads and stores fall back
10707 : to VnQI or none do. */
10708 0 : gcc_assert (!rgl->max_nscalars_per_iter
10709 : || (rgl->factor == 1 && factor == 1)
10710 : || (rgl->max_nscalars_per_iter * rgl->factor
10711 : == nscalars_per_iter * factor));
10712 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
10713 0 : rgl->type = vectype;
10714 0 : rgl->factor = factor;
10715 : }
10716 0 : }
10717 :
10718 : /* Given a complete set of lengths LENS, extract length number INDEX
10719 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10720 : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10721 : multipled by the number of elements that should be processed.
10722 : Insert any set-up statements before GSI. */
10723 :
tree
vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
		   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
		   unsigned int index, unsigned int factor, bool adjusted)
{
  rgroup_controls *rgl = &(*lens)[nvectors - 1];
  /* A nonzero target bias means a separate bias-adjusted length SSA
     name is maintained alongside the raw one.  */
  bool use_bias_adjusted_len =
    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;

  /* Populate the rgroup's len array, if this is the first time we've
     used it. */
  if (rgl->controls.is_empty ())
    {
      rgl->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
	  gcc_assert (len_type != NULL_TREE);

	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");

	  /* Provide a dummy definition until the real one is available. */
	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
	  rgl->controls[i] = len;

	  if (use_bias_adjusted_len)
	    {
	      /* Only a single control is expected when biasing applies.  */
	      gcc_assert (i == 0);
	      tree adjusted_len =
		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
	      rgl->bias_adjusted_ctrl = adjusted_len;
	    }
	}
    }

  /* Callers asking for the bias-adjusted value get the dedicated name.  */
  if (use_bias_adjusted_len && adjusted)
    return rgl->bias_adjusted_ctrl;

  tree loop_len = rgl->controls[index];
  if (rgl->factor == 1 && factor == 1)
    {
      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
      if (maybe_ne (nunits1, nunits2))
	{
	  /* A loop len for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  */
	  gcc_assert (multiple_p (nunits1, nunits2));
	  factor = exact_div (nunits1, nunits2).to_constant ();
	  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	  gimple_seq seq = NULL;
	  /* Scale the stored length down by the lane-count ratio,
	     inserting the division before GSI.  */
	  loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
				   build_int_cst (iv_type, factor));
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
    }
  else if (factor && rgl->factor != factor)
    {
      /* The number of scalars per iteration, scalar occupied bytes and
	 the number of vectors are both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
      unsigned int rglvecsize = rgl->factor * rgl->max_nscalars_per_iter;
      unsigned int vecsize = nscalars_per_iter * factor;
      /* Rescale the stored length to the requested measurement unit:
	 divide when the stored unit is coarser, multiply when finer.  */
      if (rglvecsize > vecsize)
	{
	  unsigned int fac = rglvecsize / vecsize;
	  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	  gimple_seq seq = NULL;
	  loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
				   build_int_cst (iv_type, fac));
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
      else if (rglvecsize < vecsize)
	{
	  unsigned int fac = vecsize / rglvecsize;
	  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	  gimple_seq seq = NULL;
	  loop_len = gimple_build (&seq, MULT_EXPR, iv_type, loop_len,
				   build_int_cst (iv_type, fac));
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
    }
  return loop_len;
}
10815 :
10816 : /* Generate the tree for the loop len mask and return it. Given the lens,
10817 : nvectors, vectype, index and factor to gen the len mask as below.
10818 :
10819 : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
10820 : */
10821 : tree
10822 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10823 : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
10824 : unsigned int nvectors, tree vectype, tree stmt,
10825 : unsigned int index, unsigned int factor)
10826 : {
10827 0 : tree all_one_mask = build_all_ones_cst (vectype);
10828 0 : tree all_zero_mask = build_zero_cst (vectype);
10829 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
10830 : factor, true);
10831 0 : tree bias = build_int_cst (intQI_type_node,
10832 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
10833 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
10834 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
10835 : all_one_mask, all_zero_mask, len,
10836 : bias);
10837 0 : gimple_call_set_lhs (call, len_mask);
10838 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
10839 :
10840 0 : return len_mask;
10841 : }
10842 :
10843 : /* Scale profiling counters by estimation for LOOP which is vectorized
10844 : by factor VF.
10845 : If FLAT is true, the loop we started with had unrealistically flat
10846 : profile. */
10847 :
10848 : static void
10849 61572 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
10850 : {
10851 : /* For flat profiles do not scale down proportionally by VF and only
10852 : cap by known iteration count bounds. */
10853 61572 : if (flat)
10854 : {
10855 34534 : if (dump_file && (dump_flags & TDF_DETAILS))
10856 5258 : fprintf (dump_file,
10857 : "Vectorized loop profile seems flat; not scaling iteration "
10858 : "count down by the vectorization factor %i\n", vf);
10859 34534 : scale_loop_profile (loop, profile_probability::always (),
10860 : get_likely_max_loop_iterations_int (loop));
10861 34534 : return;
10862 : }
10863 : /* Loop body executes VF fewer times and exit increases VF times. */
10864 27038 : profile_count entry_count = loop_preheader_edge (loop)->count ();
10865 :
10866 : /* If we have unreliable loop profile avoid dropping entry
10867 : count below header count. This can happen since loops
10868 : has unrealistically low trip counts. */
10869 27038 : while (vf > 1
10870 28178 : && loop->header->count > entry_count
10871 57385 : && loop->header->count < entry_count * vf)
10872 : {
10873 2169 : if (dump_file && (dump_flags & TDF_DETAILS))
10874 153 : fprintf (dump_file,
10875 : "Vectorization factor %i seems too large for profile "
10876 : "prevoiusly believed to be consistent; reducing.\n", vf);
10877 2169 : vf /= 2;
10878 : }
10879 :
10880 27038 : if (entry_count.nonzero_p ())
10881 27038 : set_edge_probability_and_rescale_others
10882 27038 : (exit_e,
10883 27038 : entry_count.probability_in (loop->header->count / vf));
10884 : /* Avoid producing very large exit probability when we do not have
10885 : sensible profile. */
10886 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
10887 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10888 27038 : loop->latch->count = single_pred_edge (loop->latch)->count ();
10889 :
10890 27038 : scale_loop_profile (loop, profile_probability::always () / vf,
10891 : get_likely_max_loop_iterations_int (loop));
10892 : }
10893 :
10894 : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10895 : original loop that has now been vectorized.
10896 :
10897 : The inits of the data_references need to be advanced with the number of
10898 : iterations of the main loop. This has been computed in vect_do_peeling and
10899 : is stored in parameter ADVANCE.
10900 :
10901 : Since the loop_vec_info of this EPILOGUE was constructed for the original
10902 : loop, its stmt_vec_infos all point to the original statements. These need
10903 : to be updated to point to their corresponding copies.
10904 :
10905 : The data_reference's connections also need to be updated. Their
10906 : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10907 : stmt_vec_infos, their statements need to point to their corresponding
10908 : copy. */
10909 :
static void
update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
{
  loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
  /* NOTE(review): 'mapping' appears unused in this function body --
     candidate for removal; verify no macro below references it.  */
  hash_map<tree,tree> mapping;
  gimple *orig_stmt, *new_stmt;
  gimple_stmt_iterator epilogue_gsi;
  gphi_iterator epilogue_phi_gsi;
  stmt_vec_info stmt_vinfo = NULL, related_vinfo;
  basic_block *epilogue_bbs = get_loop_body (epilogue);
  unsigned i;

  /* Replace the recorded body with the epilogue's own blocks.  */
  free (LOOP_VINFO_BBS (epilogue_vinfo));
  LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
  LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;

  /* The EPILOGUE loop is a copy of the original loop so they share the same
     gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
     point to the copied statements.  */
  for (unsigned i = 0; i < epilogue->num_nodes; ++i)
    {
      /* First re-point the stmt_vec_infos of the PHI nodes.  */
      for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
	{
	  new_stmt = epilogue_phi_gsi.phi ();

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
	}

      /* Then the non-PHI statements, skipping debug stmts which carry
	 no stmt_vec_info.  */
      for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
	{
	  new_stmt = gsi_stmt (epilogue_gsi);
	  if (is_gimple_debug (new_stmt))
	    continue;

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
	    {
	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
	      /* Set BB such that the assert in
		 'get_initial_defs_for_reduction' is able to determine that
		 the BB of the related stmt is inside this loop.  */
	      gimple_set_bb (stmt,
			     gimple_bb (new_stmt));
	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
	      gcc_assert (related_vinfo == NULL
			  || related_vinfo == stmt_vinfo);
	    }
	}
    }

  /* Re-point each data_reference's DR_STMT at the epilogue copy, again
     via the shared gimple UID.  */
  struct data_reference *dr;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      orig_stmt = DR_STMT (dr);
      gcc_assert (gimple_uid (orig_stmt) > 0);
      stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
      DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
    }

  /* Advance data_reference's with the number of iterations of the previous
     loop and its prologue.  */
  vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);

  /* Remember the advancement made.  */
  LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
}
10989 :
10990 : /* When vectorizing early break statements instructions that happen before
10991 : the early break in the current BB need to be moved to after the early
10992 : break. This function deals with that and assumes that any validity
10993 : checks has already been performed.
10994 :
10995 : While moving the instructions if it encounters a VUSE or VDEF it then
10996 : corrects the VUSES as it moves the statements along. GDEST is the location
10997 : in which to insert the new statements. */
10998 :
static void
move_early_exit_stmts (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("move_early_exit_stmts");

  /* Nothing to do when no stores were recorded as needing motion.  */
  if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
    return;

  /* Move all stmts that need moving.  */
  basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
  gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);

  /* Track the virtual def reaching the end of the moved sequence so the
     loads and exit PHIs below can be rewired to it.  */
  tree last_seen_vuse = NULL_TREE;
  for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
    {
      /* We have to update crossed degenerate virtual PHIs.  Simply
	 elide them.  */
      if (gphi *vphi = dyn_cast <gphi *> (stmt))
	{
	  tree vdef = gimple_phi_result (vphi);
	  tree vuse = gimple_phi_arg_def (vphi, 0);
	  imm_use_iterator iter;
	  use_operand_p use_p;
	  gimple *use_stmt;
	  /* Forward all uses of the PHI result to its single incoming
	     virtual operand, then delete the PHI.  */
	  FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
		SET_USE (use_p, vuse);
	    }
	  auto gsi = gsi_for_stmt (stmt);
	  remove_phi_node (&gsi, true);
	  last_seen_vuse = vuse;
	  continue;
	}

      /* Check to see if statement is still required for vect or has been
	 elided.  */
      auto stmt_info = loop_vinfo->lookup_stmt (stmt);
      if (!stmt_info)
	continue;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);

      /* GSI_NEW_STMT keeps DEST_GSI on the moved statement, preserving
	 the original relative order of the moved stores.  */
      gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
      gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
      last_seen_vuse = gimple_vuse (stmt);
    }

  /* Update all the stmts with their new reaching VUSES.  */
  for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "updating vuse to %T for load %G",
			 last_seen_vuse, p);
      gimple_set_vuse (p, last_seen_vuse);
      update_stmt (p);
    }

  /* And update the LC PHIs on exits.  Only exits not dominated by
     DEST_BB need their virtual PHI argument redirected.  */
  for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
      if (gphi *phi = get_virtual_phi (e->dest))
	SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
}
11065 :
11066 : /* Generate adjustment code for early break scalar IVs filling in the value
11067 : we created earlier on for LOOP_VINFO_EARLY_BRK_NITERS_VAR. */
11068 :
11069 : static void
11070 1405 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vec_info loop_vinfo)
11071 : {
11072 1405 : DUMP_VECT_SCOPE ("vect_update_ivs_after_vectorizer_for_early_breaks");
11073 :
11074 1405 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11075 0 : return;
11076 :
11077 1405 : gcc_assert (LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo));
11078 :
11079 1405 : tree phi_var = LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo);
11080 1405 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11081 1405 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11082 1405 : tree ty_var = TREE_TYPE (phi_var);
11083 1405 : auto loop = LOOP_VINFO_LOOP (loop_vinfo);
11084 1405 : tree induc_var = niters_skip ? copy_ssa_name (phi_var) : phi_var;
11085 :
11086 1405 : auto induction_phi = create_phi_node (induc_var, loop->header);
11087 1405 : tree induc_def = PHI_RESULT (induction_phi);
11088 :
11089 : /* Create the iv update inside the loop. */
11090 1405 : gimple_seq init_stmts = NULL;
11091 1405 : gimple_seq stmts = NULL;
11092 1405 : gimple_seq iv_stmts = NULL;
11093 1405 : tree tree_vf = build_int_cst (ty_var, vf);
11094 :
11095 : /* For loop len targets we have to use .SELECT_VL (ivtmp_33, VF); instead of
11096 : just += VF as the VF can change in between two loop iterations. */
11097 1405 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
11098 : {
11099 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
11100 0 : tree_vf = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
11101 : NULL_TREE, 0, 0, true);
11102 : }
11103 :
11104 1405 : tree iter_var;
11105 1405 : if (POINTER_TYPE_P (ty_var))
11106 : {
11107 0 : tree offset = gimple_convert (&stmts, sizetype, tree_vf);
11108 0 : iter_var = gimple_build (&stmts, POINTER_PLUS_EXPR, ty_var, induc_def,
11109 : gimple_convert (&stmts, sizetype, offset));
11110 : }
11111 : else
11112 : {
11113 1405 : tree offset = gimple_convert (&stmts, ty_var, tree_vf);
11114 1405 : iter_var = gimple_build (&stmts, PLUS_EXPR, ty_var, induc_def, offset);
11115 : }
11116 :
11117 1405 : tree init_var = build_zero_cst (ty_var);
11118 1405 : if (niters_skip)
11119 0 : init_var = gimple_build (&init_stmts, MINUS_EXPR, ty_var, init_var,
11120 : gimple_convert (&init_stmts, ty_var, niters_skip));
11121 :
11122 1405 : add_phi_arg (induction_phi, iter_var,
11123 : loop_latch_edge (loop), UNKNOWN_LOCATION);
11124 1405 : add_phi_arg (induction_phi, init_var,
11125 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
11126 :
11127 : /* Find the first insertion point in the BB. */
11128 1405 : auto pe = loop_preheader_edge (loop);
11129 :
11130 : /* If we've done any peeling, calculate the peeling adjustment needed to the
11131 : final IV. */
11132 1405 : if (niters_skip)
11133 : {
11134 0 : tree induc_type = TREE_TYPE (induc_def);
11135 0 : tree s_induc_type = signed_type_for (induc_type);
11136 0 : induc_def = gimple_build (&iv_stmts, MAX_EXPR, s_induc_type,
11137 : gimple_convert (&iv_stmts, s_induc_type,
11138 : induc_def),
11139 : build_zero_cst (s_induc_type));
11140 0 : auto stmt = gimple_build_assign (phi_var,
11141 : gimple_convert (&iv_stmts, induc_type,
11142 : induc_def));
11143 0 : gimple_seq_add_stmt_without_update (&iv_stmts, stmt);
11144 0 : basic_block exit_bb = NULL;
11145 : /* Identify the early exit merge block. I wish we had stored this. */
11146 0 : for (auto e : get_loop_exit_edges (loop))
11147 0 : if (e != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
11148 : {
11149 0 : exit_bb = e->dest;
11150 0 : break;
11151 0 : }
11152 :
11153 0 : gcc_assert (exit_bb);
11154 0 : auto exit_gsi = gsi_after_labels (exit_bb);
11155 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
11156 : }
11157 : /* Write the init_stmts in the loop-preheader block. */
11158 1405 : auto psi = gsi_last_nondebug_bb (pe->src);
11159 1405 : gsi_insert_seq_after (&psi, init_stmts, GSI_LAST_NEW_STMT);
11160 : /* Write the adjustments in the header block. */
11161 1405 : basic_block bb = loop->header;
11162 1405 : auto si = gsi_after_labels (bb);
11163 1405 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11164 : }
11165 :
11166 : /* Function vect_transform_loop.
11167 :
11168 : The analysis phase has determined that the loop is vectorizable.
11169 : Vectorize the loop - created vectorized stmts to replace the scalar
11170 : stmts in the loop, and update the loop exit condition.
11171 : Returns scalar epilogue loop if any. */
11172 :
11173 : class loop *
11174 61572 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11175 : {
11176 61572 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11177 61572 : class loop *epilogue = NULL;
11178 61572 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11179 61572 : int nbbs = loop->num_nodes;
11180 61572 : int i;
11181 61572 : tree niters_vector = NULL_TREE;
11182 61572 : tree step_vector = NULL_TREE;
11183 61572 : tree niters_vector_mult_vf = NULL_TREE;
11184 61572 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11185 61572 : unsigned int lowest_vf = constant_lower_bound (vf);
11186 61572 : gimple *stmt;
11187 61572 : bool check_profitability = false;
11188 61572 : unsigned int th;
11189 61572 : bool flat = maybe_flat_loop_profile (loop);
11190 61572 : bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);
      : /* When UNCOUNTED_P is set the niters-based steps below (the overflow
      : check, the vector-iteration-count computation and the loop-condition
      : rewrite) are skipped.  */
11191 :
11192 61572 : DUMP_VECT_SCOPE ("vec_transform_loop");
11193 :
11194 61572 : if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11195 54505 : loop_vinfo->shared->check_datarefs ();
11196 :
11197 : /* Use the more conservative vectorization threshold. If the number
11198 : of iterations is constant assume the cost check has been performed
11199 : by our caller. If the threshold makes all loops profitable that
11200 : run at least the (estimated) vectorization factor number of times
11201 : checking is pointless, too. */
11202 61572 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11203 61572 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11204 : {
11205 18670 : if (dump_enabled_p ())
11206 171 : dump_printf_loc (MSG_NOTE, vect_location,
11207 : "Profitability threshold is %d loop iterations.\n",
11208 : th);
11209 : check_profitability = true;
11210 : }
11211 :
11212 : /* Make sure there exists a single-predecessor exit bb. Do this before
11213 : versioning. */
11214 61572 : edge e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
11215 61572 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11216 : {
11217 19127 : split_loop_exit_edge (e, true);
11218 19127 : if (dump_enabled_p ())
11219 2269 : dump_printf (MSG_NOTE, "split exit edge\n");
11220 : }
11221 :
11222 : /* Version the loop first, if required, so the profitability check
11223 : comes first. */
11224 :
11225 61572 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11226 : {
11227 3790 : class loop *sloop
11228 3790 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11229 3790 : sloop->force_vectorize = false;
      : /* NOTE(review): the runtime profitability check appears to be folded
      : into the versioning condition, so it is dropped here -- confirm.  */
11230 3790 : check_profitability = false;
11231 : }
11232 :
11233 : /* Make sure there exists a single-predecessor exit bb also on the
11234 : scalar loop copy. Do this after versioning but before peeling
11235 : so CFG structure is fine for both scalar and if-converted loop
11236 : to make slpeel_duplicate_current_defs_from_edges face matched
11237 : loop closed PHI nodes on the exit. */
11238 61572 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11239 : {
11240 8037 : e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
11241 8037 : if (! single_pred_p (e->dest))
11242 : {
11243 7768 : split_loop_exit_edge (e, true);
11244 7768 : if (dump_enabled_p ())
11245 1137 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11246 : }
11247 : }
11248 :
11249 61572 : tree niters = vect_build_loop_niters (loop_vinfo);
11250 61572 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11251 61572 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11252 61572 : tree advance;
11253 61572 : drs_init_vec orig_drs_init;
11254 61572 : bool niters_no_overflow = uncounted_p ? false /* Not known. */
11255 61529 : : loop_niters_no_overflow (loop_vinfo);
11256 :
11257 61572 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11258 : &step_vector, &niters_vector_mult_vf, th,
11259 : check_profitability, niters_no_overflow,
11260 : &advance);
11261 :
11262 : /* Assign hierarchical discriminators to the vectorized loop. */
11263 61572 : poly_uint64 vf_val = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11264 61572 : unsigned int vf_int = constant_lower_bound (vf_val);
11265 61572 : if (vf_int > DISCR_MULTIPLICITY_MAX)
11266 : vf_int = DISCR_MULTIPLICITY_MAX;
11267 :
11268 : /* Assign unique copy_id dynamically instead of using hardcoded constants.
11269 : Epilogue and main vectorized loops get different copy_ids. */
11270 61572 : gimple *loop_last = last_nondebug_stmt (loop->header);
11271 61572 : location_t loop_loc
11272 61572 : = loop_last ? gimple_location (loop_last) : UNKNOWN_LOCATION;
11273 61298 : if (loop_loc != UNKNOWN_LOCATION)
11274 : {
11275 50765 : unsigned int copyid = allocate_copyid_base (loop_loc, 1);
11276 50765 : assign_discriminators_to_loop (loop, vf_int, copyid);
11277 : }
11278 61572 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11279 61572 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11280 : {
11281 : /* Ifcvt duplicates loop preheader, loop body and produces an basic
11282 : block after loop exit. We need to scale all that. */
11283 88 : basic_block preheader
11284 88 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11285 88 : preheader->count
11286 : = preheader->count.apply_probability
11287 88 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11288 88 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11289 : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11290 88 : LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo)->dest->count = preheader->count;
11291 : }
11292 :
11293 61572 : if (niters_vector == NULL_TREE && !uncounted_p)
11294 : {
11295 27353 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11296 27353 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11297 55448 : && known_eq (lowest_vf, vf))
11298 : {
11299 27350 : niters_vector
11300 27350 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11301 27350 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11302 27350 : step_vector = build_one_cst (TREE_TYPE (niters));
11303 : }
11304 748 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11305 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11306 : &step_vector, niters_no_overflow);
11307 : else
11308 : /* vect_do_peeling subtracted the number of peeled prologue
11309 : iterations from LOOP_VINFO_NITERS. */
11310 747 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11311 : &niters_vector, &step_vector,
11312 : niters_no_overflow);
11313 : }
11314 :
11315 : /* 1) Make sure the loop header has exactly two entries
11316 : 2) Make sure we have a preheader basic block. */
11317 :
11318 61572 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11319 :
11320 61572 : split_edge (loop_preheader_edge (loop));
11321 :
11322 61572 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11323 : /* This will deal with any possible peeling. */
11324 1 : vect_prepare_for_masked_peels (loop_vinfo);
11325 :
11326 : /* Handle any code motion that we need to for early-break vectorization after
11327 : we've done peeling but just before we start vectorizing. */
11328 61572 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11329 : {
11330 1405 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vinfo);
11331 1405 : move_early_exit_stmts (loop_vinfo);
11332 : }
11333 :
11334 : /* Remove existing clobber stmts and prefetches. */
11335 187957 : for (i = 0; i < nbbs; i++)
11336 : {
11337 126385 : basic_block bb = bbs[i];
11338 1096629 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11339 : {
11340 843859 : stmt = gsi_stmt (si);
11341 843859 : if (gimple_clobber_p (stmt)
11342 843859 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11343 : {
      : /* Unlink the virtual def before removal so the VUSE chain
      : stays consistent.  */
11344 90 : unlink_stmt_vdef (stmt);
11345 90 : gsi_remove (&si, true);
11346 90 : release_defs (stmt);
11347 : }
11348 : else
11349 843769 : gsi_next (&si);
11350 : }
11351 : }
11352 :
11353 : /* Schedule the SLP instances. */
11354 61572 : if (!loop_vinfo->slp_instances.is_empty ())
11355 : {
11356 61572 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11357 61572 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11358 : }
11359 :
11360 : /* Generate the loop invariant statements. */
11361 61572 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11362 : {
11363 73 : if (dump_enabled_p ())
11364 30 : dump_printf_loc (MSG_NOTE, vect_location,
11365 : "------>generating loop invariant statements\n");
11366 73 : gimple_stmt_iterator gsi;
11367 73 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11368 73 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11369 : GSI_CONTINUE_LINKING);
11370 : }
11371 :
11372 : /* Stub out scalar statements that must not survive vectorization and
11373 : were not picked as relevant in any SLP instance.
11374 : Doing this here helps with grouped statements, or statements that
11375 : are involved in patterns. */
11376 187957 : for (i = 0; i < nbbs; i++)
11377 : {
11378 126385 : basic_block bb = bbs[i];
11379 126385 : stmt_vec_info stmt_info;
11380 252770 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11381 1680695 : !gsi_end_p (gsi); gsi_next (&gsi))
11382 : {
11383 1554310 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11384 6355 : if (!call || !gimple_call_internal_p (call))
11385 1549109 : continue;
11386 5201 : internal_fn ifn = gimple_call_internal_fn (call);
11387 5201 : if (ifn == IFN_MASK_LOAD)
11388 : {
11389 735 : tree lhs = gimple_get_lhs (call);
      : /* A non-vector lhs means the masked load was not replaced by
      : a vector form; stub the scalar result with zero.  */
11390 735 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11391 : {
11392 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11393 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11394 0 : gsi_replace (&gsi, new_stmt, true);
11395 : }
11396 : }
11397 4466 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11398 : {
11399 2295 : tree lhs = gimple_get_lhs (call);
11400 2295 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11401 : {
      : /* Unvectorized conditional IFN: fall back to its else
      : value, which is passed as the last call argument.  */
11402 0 : tree else_arg
11403 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11404 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11405 0 : gsi_replace (&gsi, new_stmt, true);
11406 : }
11407 : }
11408 2171 : else if (ifn == IFN_MASK_CALL
11409 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11410 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11411 2175 : && !STMT_VINFO_LIVE_P (stmt_info))
11412 : {
11413 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11414 4 : loop_vinfo->remove_stmt (stmt_info);
11415 : }
11416 : }
11417 : }
11418 :
11419 61572 : if (!uncounted_p)
11420 : {
11421 : /* The vectorization factor is always > 1, so if we use an IV increment of
11422 : 1. A zero NITERS becomes a nonzero NITERS_VECTOR. */
11423 61529 : if (integer_onep (step_vector))
11424 61511 : niters_no_overflow = true;
11425 :
11426 61529 : vect_set_loop_condition (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
11427 : loop_vinfo, niters_vector, step_vector,
11428 61529 : niters_vector_mult_vf, !niters_no_overflow);
11429 : }
11430 :
11431 61572 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11432 :
11433 : /* True if the final iteration might not handle a full vector's
11434 : worth of scalar iterations. */
11435 123144 : bool final_iter_may_be_partial
11436 61572 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11437 61572 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
11438 :
11439 : /* +1 to convert latch counts to loop iteration counts. */
11440 61572 : int bias_for_lowest = 1;
11441 :
11442 : /* When we are peeling for gaps then we take away one scalar iteration
11443 : from the vector loop. Thus we can adjust the upper bound by one
11444 : scalar iteration. But only when we know the bound applies to the
11445 : IV exit test which might not be true when we have multiple exits. */
11446 61572 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11447 119962 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11448 :
11449 61572 : int bias_for_assumed = bias_for_lowest;
11450 61572 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11451 61572 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11452 : {
11453 : /* When the amount of peeling is known at compile time, the first
11454 : iteration will have exactly alignment_npeels active elements.
11455 : In the worst case it will have at least one. */
11456 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11457 1 : bias_for_lowest += lowest_vf - min_first_active;
11458 1 : bias_for_assumed += assumed_vf - min_first_active;
11459 : }
11460 : /* In these calculations the "- 1" converts loop iteration counts
11461 : back to latch counts. */
11462 61572 : if (loop->any_upper_bound)
11463 : {
11464 61556 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11465 61556 : loop->nb_iterations_upper_bound
11466 61556 : = (final_iter_may_be_partial
11467 62963 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11468 2814 : lowest_vf) - 1
11469 60149 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11470 120298 : lowest_vf) - 1);
11471 61556 : if (main_vinfo
11472 : /* Both peeling for alignment and peeling for gaps can end up
11473 : with the scalar epilogue running for more than VF-1 iterations. */
11474 7067 : && !main_vinfo->peeling_for_alignment
11475 7019 : && !main_vinfo->peeling_for_gaps)
11476 : {
11477 6837 : unsigned int bound;
11478 6837 : poly_uint64 main_iters
11479 6837 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11480 : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11481 6837 : main_iters
11482 6837 : = upper_bound (main_iters,
11483 6837 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11484 13674 : if (can_div_away_from_zero_p (main_iters,
11485 6837 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11486 : &bound))
11487 6837 : loop->nb_iterations_upper_bound
11488 6837 : = wi::umin ((bound_wide_int) (bound - 1),
11489 6837 : loop->nb_iterations_upper_bound);
11490 : }
11491 : }
11492 61572 : if (loop->any_likely_upper_bound)
11493 61556 : loop->nb_iterations_likely_upper_bound
11494 61556 : = (final_iter_may_be_partial
11495 62963 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11496 1407 : + bias_for_lowest, lowest_vf) - 1
11497 60149 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11498 61556 : + bias_for_lowest, lowest_vf) - 1);
11499 61572 : if (loop->any_estimate)
11500 35476 : loop->nb_iterations_estimate
11501 35476 : = (final_iter_may_be_partial
11502 36169 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11503 1386 : assumed_vf) - 1
11504 34783 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11505 70259 : assumed_vf) - 1);
11506 61572 : scale_profile_for_vect_loop (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
11507 : assumed_vf, flat);
11508 :
11509 61572 : if (dump_enabled_p ())
11510 : {
11511 10955 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11512 : {
11513 9506 : dump_printf_loc (MSG_NOTE, vect_location,
11514 : "LOOP VECTORIZED\n");
11515 9506 : if (loop->inner)
11516 343 : dump_printf_loc (MSG_NOTE, vect_location,
11517 : "OUTER LOOP VECTORIZED\n");
11518 9506 : dump_printf (MSG_NOTE, "\n");
11519 : }
11520 : else
11521 1449 : dump_printf_loc (MSG_NOTE, vect_location,
11522 : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11523 1449 : GET_MODE_NAME (loop_vinfo->vector_mode));
11524 : }
11525 :
11526 : /* Loops vectorized with a variable factor won't benefit from
11527 : unrolling/peeling. */
11528 61572 : if (!vf.is_constant ())
11529 : {
11530 : loop->unroll = 1;
11531 : if (dump_enabled_p ())
11532 : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11533 : " variable-length vectorization factor\n");
11534 : }
11535 :
11536 : /* When we have unrolled the loop due to a user requested value we should
11537 : leave it up to the RTL unroll heuristics to determine if it's still worth
11538 : while to unroll more. */
11539 61572 : if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
11540 44 : loop->unroll = 0;
11541 :
11542 : /* Free SLP instances here because otherwise stmt reference counting
11543 : won't work. */
11544 : slp_instance instance;
11545 151489 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11546 89917 : vect_free_slp_instance (instance);
11547 61572 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11548 : /* Clear-up safelen field since its value is invalid after vectorization
11549 : since vectorized loop can have loop-carried dependencies. */
11550 61572 : loop->safelen = 0;
11551 :
11552 61572 : if (epilogue)
11553 : {
11554 : /* Accumulate past advancements made. */
11555 7067 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
11556 89 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
11557 : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
11558 : advance);
11559 7067 : update_epilogue_loop_vinfo (epilogue, advance);
11560 :
11561 7067 : epilogue->simduid = loop->simduid;
11562 7067 : epilogue->force_vectorize = loop->force_vectorize;
11563 7067 : epilogue->dont_vectorize = false;
11564 : }
11565 :
11566 61572 : return epilogue;
11567 61572 : }
11568 :
11569 : /* The code below is trying to perform simple optimization - revert
11570 : if-conversion for masked stores, i.e. if the mask of a store is zero
11571 : do not perform it and all stored value producers also if possible.
11572 : For example,
11573 : for (i=0; i<n; i++)
11574 : if (c[i])
11575 : {
11576 : p1[i] += 1;
11577 : p2[i] = p3[i] +2;
11578 : }
11579 : this transformation will produce the following semi-hammock:
11580 :
11581 : if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11582 : {
11583 : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11584 : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11585 : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11586 : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11587 : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11588 : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11589 : }
11590 : */
11591 :
11592 : void
11593 499 : optimize_mask_stores (class loop *loop)
11594 : {
11595 499 : basic_block *bbs = get_loop_body (loop);
11596 499 : unsigned nbbs = loop->num_nodes;
11597 499 : unsigned i;
11598 499 : basic_block bb;
11599 499 : class loop *bb_loop;
11600 499 : gimple_stmt_iterator gsi;
11601 499 : gimple *stmt;
11602 499 : auto_vec<gimple *> worklist;
      : /* RAII guard: presumably resets vect_location again on return --
      : vect_location is set just below.  */
11603 499 : auto_purge_vect_location sentinel;
11604 :
11605 499 : vect_location = find_loop_location (loop);
11606 : /* Pick up all masked stores in loop if any. */
11607 1996 : for (i = 0; i < nbbs; i++)
11608 : {
11609 998 : bb = bbs[i];
11610 17427 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11611 15431 : gsi_next (&gsi))
11612 : {
11613 15431 : stmt = gsi_stmt (gsi);
11614 15431 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11615 701 : worklist.safe_push (stmt);
11616 : }
11617 : }
11618 :
11619 499 : free (bbs);
11620 499 : if (worklist.is_empty ())
11621 68 : return;
11622 :
11623 : /* Loop has masked stores. */
11624 1115 : while (!worklist.is_empty ())
11625 : {
11626 684 : gimple *last, *last_store;
11627 684 : edge e, efalse;
11628 684 : tree mask;
11629 684 : basic_block store_bb, join_bb;
11630 684 : gimple_stmt_iterator gsi_to;
11631 684 : tree vdef, new_vdef;
11632 684 : gphi *phi;
11633 684 : tree vectype;
11634 684 : tree zero;
11635 :
11636 684 : last = worklist.pop ();
11637 684 : mask = gimple_call_arg (last, 2);
11638 684 : bb = gimple_bb (last);
11639 : /* Create then_bb and if-then structure in CFG, then_bb belongs to
11640 : the same loop as if_bb. It could be different to LOOP when two
11641 : level loop-nest is vectorized and mask_store belongs to the inner
11642 : one. */
11643 684 : e = split_block (bb, last);
11644 684 : bb_loop = bb->loop_father;
11645 684 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11646 684 : join_bb = e->dest;
11647 684 : store_bb = create_empty_bb (bb);
11648 684 : add_bb_to_loop (store_bb, bb_loop);
      : /* BB will end in a "mask == {0}" comparison (built below): the true
      : edge bypasses STORE_BB, the false edge falls into it.  */
11649 684 : e->flags = EDGE_TRUE_VALUE;
11650 684 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11651 : /* Put STORE_BB to likely part. */
11652 684 : efalse->probability = profile_probability::likely ();
11653 684 : e->probability = efalse->probability.invert ();
11654 684 : store_bb->count = efalse->count ();
11655 684 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11656 684 : if (dom_info_available_p (CDI_DOMINATORS))
11657 684 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11658 684 : if (dump_enabled_p ())
11659 351 : dump_printf_loc (MSG_NOTE, vect_location,
11660 : "Create new block %d to sink mask stores.",
11661 : store_bb->index);
11662 : /* Create vector comparison with boolean result. */
11663 684 : vectype = TREE_TYPE (mask);
11664 684 : zero = build_zero_cst (vectype);
11665 684 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11666 684 : gsi = gsi_last_bb (bb);
11667 684 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11668 : /* Create new PHI node for vdef of the last masked store:
11669 : .MEM_2 = VDEF <.MEM_1>
11670 : will be converted to
11671 : .MEM.3 = VDEF <.MEM_1>
11672 : and new PHI node will be created in join bb
11673 : .MEM_2 = PHI <.MEM_1, .MEM_3>
11674 : */
11675 684 : vdef = gimple_vdef (last);
11676 684 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
11677 684 : gimple_set_vdef (last, new_vdef);
11678 684 : phi = create_phi_node (vdef, join_bb);
11679 684 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11680 :
11681 : /* Put all masked stores with the same mask to STORE_BB if possible. */
11682 718 : while (true)
11683 : {
11684 701 : gimple_stmt_iterator gsi_from;
11685 701 : gimple *stmt1 = NULL;
11686 :
11687 : /* Move masked store to STORE_BB. */
11688 701 : last_store = last;
11689 701 : gsi = gsi_for_stmt (last);
11690 701 : gsi_from = gsi;
11691 : /* Shift GSI to the previous stmt for further traversal. */
11692 701 : gsi_prev (&gsi);
11693 701 : gsi_to = gsi_start_bb (store_bb);
11694 701 : gsi_move_before (&gsi_from, &gsi_to);
11695 : /* Setup GSI_TO to the non-empty block start. */
11696 701 : gsi_to = gsi_start_bb (store_bb);
11697 701 : if (dump_enabled_p ())
11698 367 : dump_printf_loc (MSG_NOTE, vect_location,
11699 : "Move stmt to created bb\n%G", last);
11700 : /* Move all stored value producers if possible. */
11701 4976 : while (!gsi_end_p (gsi))
11702 : {
11703 4975 : tree lhs;
11704 4975 : imm_use_iterator imm_iter;
11705 4975 : use_operand_p use_p;
11706 4975 : bool res;
11707 :
11708 : /* Skip debug statements. */
11709 4975 : if (is_gimple_debug (gsi_stmt (gsi)))
11710 : {
11711 3 : gsi_prev (&gsi);
11712 3231 : continue;
11713 : }
11714 4972 : stmt1 = gsi_stmt (gsi);
11715 : /* Do not consider statements writing to memory or having
11716 : volatile operand. */
11717 9794 : if (gimple_vdef (stmt1)
11718 9794 : || gimple_has_volatile_ops (stmt1))
11719 : break;
11720 4822 : gsi_from = gsi;
11721 4822 : gsi_prev (&gsi);
11722 4822 : lhs = gimple_get_lhs (stmt1);
11723 4822 : if (!lhs)
11724 : break;
11725 :
11726 : /* LHS of vectorized stmt must be SSA_NAME. */
11727 4822 : if (TREE_CODE (lhs) != SSA_NAME)
11728 : break;
11729 :
11730 4822 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11731 : {
11732 : /* Remove dead scalar statement. */
11733 3560 : if (has_zero_uses (lhs))
11734 : {
11735 3228 : gsi_remove (&gsi_from, true);
11736 3228 : release_defs (stmt1);
11737 3228 : continue;
11738 : }
11739 : }
11740 :
11741 : /* Check that LHS does not have uses outside of STORE_BB. */
11742 1594 : res = true;
11743 4333 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11744 : {
11745 1695 : gimple *use_stmt;
11746 1695 : use_stmt = USE_STMT (use_p);
11747 1695 : if (is_gimple_debug (use_stmt))
11748 0 : continue;
11749 1695 : if (gimple_bb (use_stmt) != store_bb)
11750 : {
11751 : res = false;
11752 : break;
11753 : }
11754 1594 : }
11755 1594 : if (!res)
11756 : break;
11757 :
11758 1044 : if (gimple_vuse (stmt1)
11759 1480 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
11760 : break;
11761 :
11762 : /* Can move STMT1 to STORE_BB. */
11763 1044 : if (dump_enabled_p ())
11764 563 : dump_printf_loc (MSG_NOTE, vect_location,
11765 : "Move stmt to created bb\n%G", stmt1);
11766 1044 : gsi_move_before (&gsi_from, &gsi_to);
11767 : /* Shift GSI_TO for further insertion. */
11768 2088 : gsi_prev (&gsi_to);
11769 : }
11770 : /* Put other masked stores with the same mask to STORE_BB. */
      : /* Only chain the next worklist entry when it uses the same mask
      : and is exactly the statement immediately preceding the ones
      : just moved.  */
11771 701 : if (worklist.is_empty ()
11772 270 : || gimple_call_arg (worklist.last (), 2) != mask
11773 17 : || worklist.last () != stmt1)
11774 : break;
11775 17 : last = worklist.pop ();
11776 17 : }
11777 1368 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11778 : }
11779 499 : }
11780 :
11781 : /* Decide whether it is possible to use a zero-based induction variable
11782 : when vectorizing LOOP_VINFO with partial vectors. If it is, return
11783 : the value that the induction variable must be able to hold in order
11784 : to ensure that the rgroups eventually have no active vector elements.
11785 : Return -1 otherwise. */
11786 :
11787 : widest_int
11788 33510 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11789 : {
11790 33510 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11791 33510 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11792 33510 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11793 :
11794 : /* Calculate the value that the induction variable must be able
11795 : to hit in order to ensure that we end the loop with an all-false mask.
11796 : This involves adding the maximum number of inactive trailing scalar
11797 : iterations. */
11798 33510 : widest_int iv_limit = -1;
11799 33510 : if (max_loop_iterations (loop, &iv_limit))
11800 : {
11801 33510 : if (niters_skip)
11802 : {
11803 : /* Add the maximum number of skipped iterations to the
11804 : maximum iteration count. */
11805 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
11806 0 : iv_limit += wi::to_widest (niters_skip);
11807 : else
11808 0 : iv_limit += max_vf - 1;
11809 : }
11810 33510 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11811 : /* Make a conservatively-correct assumption. */
11812 336 : iv_limit += max_vf - 1;
11813 :
11814 : /* IV_LIMIT is the maximum number of latch iterations, which is also
11815 : the maximum in-range IV value. Round this value down to the previous
11816 : vector alignment boundary and then add an extra full iteration. */
11817 33510 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11818 33510 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11819 : }
11820 33510 : return iv_limit;
11821 : }
11822 :
11823 : /* For the given rgroup_controls RGC, check whether an induction variable
11824 : would ever hit a value that produces a set of all-false masks or zero
11825 : lengths before wrapping around. Return true if it's possible to wrap
11826 : around before hitting the desirable value, otherwise return false. */
11827 :
11828 : bool
11829 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11830 : {
11831 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11832 :
11833 0 : if (iv_limit == -1)
11834 : return true;
11835 :
11836 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11837 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
11838 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11839 :
11840 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11841 : return true;
11842 :
11843 : return false;
11844 0 : }
|