Line data Source code
1 : /* Loop Vectorization
2 : Copyright (C) 2003-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #define INCLUDE_ALGORITHM
23 : #include "config.h"
24 : #include "system.h"
25 : #include "coretypes.h"
26 : #include "backend.h"
27 : #include "target.h"
28 : #include "rtl.h"
29 : #include "tree.h"
30 : #include "gimple.h"
31 : #include "cfghooks.h"
32 : #include "tree-pass.h"
33 : #include "ssa.h"
34 : #include "optabs-tree.h"
35 : #include "memmodel.h"
36 : #include "optabs.h"
37 : #include "diagnostic-core.h"
38 : #include "fold-const.h"
39 : #include "stor-layout.h"
40 : #include "cfganal.h"
41 : #include "gimplify.h"
42 : #include "gimple-iterator.h"
43 : #include "gimplify-me.h"
44 : #include "tree-ssa-loop-ivopts.h"
45 : #include "tree-ssa-loop-manip.h"
46 : #include "tree-ssa-loop-niter.h"
47 : #include "tree-ssa-loop.h"
48 : #include "cfgloop.h"
49 : #include "tree-scalar-evolution.h"
50 : #include "tree-vectorizer.h"
51 : #include "gimple-fold.h"
52 : #include "cgraph.h"
53 : #include "tree-cfg.h"
54 : #include "tree-if-conv.h"
55 : #include "internal-fn.h"
56 : #include "tree-vector-builder.h"
57 : #include "vec-perm-indices.h"
58 : #include "tree-eh.h"
59 : #include "case-cfn-macros.h"
60 : #include "langhooks.h"
61 : #include "opts.h"
62 : #include "hierarchical_discriminator.h"
63 :
64 : /* Loop Vectorization Pass.
65 :
66 : This pass tries to vectorize loops.
67 :
68 : For example, the vectorizer transforms the following simple loop:
69 :
70 : short a[N]; short b[N]; short c[N]; int i;
71 :
72 : for (i=0; i<N; i++){
73 : a[i] = b[i] + c[i];
74 : }
75 :
76 : as if it was manually vectorized by rewriting the source code into:
77 :
78 : typedef int __attribute__((mode(V8HI))) v8hi;
79 : short a[N]; short b[N]; short c[N]; int i;
80 : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
81 : v8hi va, vb, vc;
82 :
83 : for (i=0; i<N/8; i++){
84 : vb = pb[i];
85 : vc = pc[i];
86 : va = vb + vc;
87 : pa[i] = va;
88 : }
89 :
90 : The main entry to this pass is vectorize_loops(), in which
91 : the vectorizer applies a set of analyses on a given set of loops,
92 : followed by the actual vectorization transformation for the loops that
93 : had successfully passed the analysis phase.
94 : Throughout this pass we make a distinction between two types of
95 : data: scalars (which are represented by SSA_NAMES), and memory references
96 : ("data-refs"). These two types of data require different handling both
97 : during analysis and transformation. The types of data-refs that the
98 : vectorizer currently supports are ARRAY_REFS which base is an array DECL
99 : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
100 : accesses are required to have a simple (consecutive) access pattern.
101 :
102 : Analysis phase:
103 : ===============
104 : The driver for the analysis phase is vect_analyze_loop().
105 : It applies a set of analyses, some of which rely on the scalar evolution
106 : analyzer (scev) developed by Sebastian Pop.
107 :
108 : During the analysis phase the vectorizer records some information
109 : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
110 : loop, as well as general information about the loop as a whole, which is
111 : recorded in a "loop_vec_info" struct attached to each loop.
112 :
113 : Transformation phase:
114 : =====================
115 : The loop transformation phase scans all the stmts in the loop, and
116 : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
117 : the loop that needs to be vectorized. It inserts the vector code sequence
118 : just before the scalar stmt S, and records a pointer to the vector code
119 : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
120 : attached to S). This pointer will be used for the vectorization of following
121 : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
122 : otherwise, we rely on dead code elimination for removing it.
123 :
124 : For example, say stmt S1 was vectorized into stmt VS1:
125 :
126 : VS1: vb = px[i];
127 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
128 : S2: a = b;
129 :
130 : To vectorize stmt S2, the vectorizer first finds the stmt that defines
131 : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
132 : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
133 : resulting sequence would be:
134 :
135 : VS1: vb = px[i];
136 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
137 : VS2: va = vb;
138 : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
139 :
140 : Operands that are not SSA_NAMEs, are data-refs that appear in
141 : load/store operations (like 'x[i]' in S1), and are handled differently.
142 :
143 : Target modeling:
144 : =================
145 : Currently the only target specific information that is used is the
146 : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
147 : Targets that can support different sizes of vectors, for now will need
148 : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
149 : flexibility will be added in the future.
150 :
151 : Since we only vectorize operations which vector form can be
152 : expressed using existing tree codes, to verify that an operation is
153 : supported, the vectorizer checks the relevant optab at the relevant
154 : machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
155 : the value found is CODE_FOR_nothing, then there's no target support, and
156 : we can't vectorize the stmt.
157 :
158 : For additional information on this project see:
159 : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 : */
161 :
162 : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
163 : unsigned *);
164 : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
165 : gphi **);
166 :
167 :
168 : /* Function vect_is_simple_iv_evolution.
169 :
170 : FORNOW: A simple evolution of an induction variables in the loop is
171 : considered a polynomial evolution. */
172 :
173 : static bool
174 766212 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
175 : stmt_vec_info stmt_info)
176 : {
177 766212 : tree init_expr;
178 766212 : tree step_expr;
179 766212 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
180 766212 : basic_block bb;
181 :
182 : /* When there is no evolution in this loop, the evolution function
183 : is not "simple". */
184 766212 : if (evolution_part == NULL_TREE)
185 : return false;
186 :
187 : /* When the evolution is a polynomial of degree >= 2
188 : the evolution function is not "simple". */
189 814552 : if (tree_is_chrec (evolution_part))
190 : return false;
191 :
192 664160 : step_expr = evolution_part;
193 664160 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
194 :
195 664160 : if (dump_enabled_p ())
196 38544 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
197 : step_expr, init_expr);
198 :
199 664160 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
200 664160 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;
201 :
202 664160 : if (TREE_CODE (step_expr) != INTEGER_CST
203 56016 : && (TREE_CODE (step_expr) != SSA_NAME
204 44453 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
205 44203 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
206 7714 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
207 127 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
208 127 : || !flag_associative_math)))
209 712557 : && (TREE_CODE (step_expr) != REAL_CST
210 431 : || !flag_associative_math))
211 : {
212 48340 : if (dump_enabled_p ())
213 2948 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
214 : "step unknown.\n");
215 48340 : return false;
216 : }
217 :
218 : return true;
219 : }
220 :
221 : /* Function vect_is_nonlinear_iv_evolution
222 :
223 : Only support nonlinear induction for integer type
224 : 1. neg
225 : 2. mul by constant
226 : 3. lshift/rshift by constant.
227 :
228 : For neg induction, return a fake step as integer -1. */
229 : static bool
230 147960 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
231 : gphi* loop_phi_node)
232 : {
233 147960 : tree init_expr, ev_expr, result, op1, op2;
234 147960 : gimple* def;
235 :
236 147960 : if (gimple_phi_num_args (loop_phi_node) != 2)
237 : return false;
238 :
239 147960 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
240 147960 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
241 :
242 : /* Support nonlinear induction only for integer type. */
243 147960 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
244 : return false;
245 :
246 90341 : result = PHI_RESULT (loop_phi_node);
247 :
248 90341 : if (TREE_CODE (ev_expr) != SSA_NAME
249 88042 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
250 90341 : || !is_gimple_assign (def))
251 : return false;
252 :
253 82184 : enum tree_code t_code = gimple_assign_rhs_code (def);
254 82184 : tree step;
255 82184 : switch (t_code)
256 : {
257 1808 : case NEGATE_EXPR:
258 1808 : if (gimple_assign_rhs1 (def) != result)
259 : return false;
260 1808 : step = build_int_cst (TREE_TYPE (init_expr), -1);
261 1808 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
262 1808 : break;
263 :
264 14361 : case RSHIFT_EXPR:
265 14361 : case LSHIFT_EXPR:
266 14361 : case MULT_EXPR:
267 14361 : op1 = gimple_assign_rhs1 (def);
268 14361 : op2 = gimple_assign_rhs2 (def);
269 14361 : if (TREE_CODE (op2) != INTEGER_CST
270 10504 : || op1 != result)
271 : return false;
272 10122 : step = op2;
273 10122 : if (t_code == LSHIFT_EXPR)
274 472 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
275 9650 : else if (t_code == RSHIFT_EXPR)
276 8682 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
277 : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
278 : else
279 968 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
280 : break;
281 :
282 : default:
283 : return false;
284 : }
285 :
286 11930 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
287 11930 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
288 :
289 11930 : return true;
290 : }
291 :
292 : /* Returns true if Phi is a first-order recurrence. A first-order
293 : recurrence is a non-reduction recurrence relation in which the value of
294 : the recurrence in the current loop iteration equals a value defined in
295 : the previous iteration. */
296 :
297 : static bool
298 66876 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
299 : gphi *phi)
300 : {
301 : /* A nested cycle isn't vectorizable as first order recurrence. */
302 66876 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
303 : return false;
304 :
305 : /* Ensure the loop latch definition is from within the loop. */
306 66710 : edge latch = loop_latch_edge (loop);
307 66710 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
308 66710 : if (TREE_CODE (ldef) != SSA_NAME
309 64068 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
310 64002 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
311 126108 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
312 7962 : return false;
313 :
314 58748 : tree def = gimple_phi_result (phi);
315 :
316 : /* Ensure every use_stmt of the phi node is dominated by the latch
317 : definition. */
318 58748 : imm_use_iterator imm_iter;
319 58748 : use_operand_p use_p;
320 130178 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
321 70937 : if (!is_gimple_debug (USE_STMT (use_p))
322 138322 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
323 46397 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
324 : USE_STMT (use_p))))
325 58255 : return false;
326 :
327 : /* First-order recurrence autovectorization needs shuffle vector. */
328 493 : tree scalar_type = TREE_TYPE (def);
329 493 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
330 493 : if (!vectype)
331 : return false;
332 :
333 : return true;
334 : }
335 :
336 : /* Function vect_analyze_scalar_cycles_1.
337 :
338 : Examine the cross iteration def-use cycles of scalar variables
339 : in LOOP. LOOP_VINFO represents the loop that is now being
340 : considered for vectorization (can be LOOP, or an outer-loop
341 : enclosing LOOP). SLP indicates there will be some subsequent
342 : slp analyses or not. */
343 :
344 : static void
345 381941 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
346 : {
347 381941 : basic_block bb = loop->header;
348 381941 : auto_vec<stmt_vec_info, 64> worklist;
349 381941 : gphi_iterator gsi;
350 :
351 381941 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
352 :
353 : /* First - identify all inductions. Reduction detection assumes that all the
354 : inductions have been identified, therefore, this order must not be
355 : changed. */
356 1342062 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
357 : {
358 960121 : gphi *phi = gsi.phi ();
359 960121 : tree access_fn = NULL;
360 960121 : tree def = PHI_RESULT (phi);
361 960121 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
362 :
363 : /* Skip virtual phi's. The data dependences that are associated with
364 : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
365 1920242 : if (virtual_operand_p (def))
366 332377 : continue;
367 :
368 : /* Skip already analyzed inner loop PHIs of double reductions. */
369 767157 : if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
370 945 : continue;
371 :
372 766212 : if (dump_enabled_p ())
373 40607 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
374 : (gimple *) phi);
375 :
376 766212 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
377 :
378 : /* Analyze the evolution function. */
379 766212 : access_fn = analyze_scalar_evolution (loop, def);
380 766212 : if (dump_enabled_p ())
381 40607 : dump_printf_loc (MSG_NOTE, vect_location,
382 : "Access function of PHI: %T\n", access_fn);
383 766212 : if (access_fn)
384 766212 : STRIP_NOPS (access_fn);
385 :
386 904680 : if ((!access_fn
387 766212 : || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
388 615820 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
389 10861 : && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
390 : != INTEGER_CST)))
391 : /* Only handle nonlinear iv for same loop. */
392 916610 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
393 147960 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
394 : {
395 138468 : worklist.safe_push (stmt_vinfo);
396 138468 : continue;
397 : }
398 :
399 627744 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
400 : != NULL_TREE);
401 627744 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
402 :
403 627744 : if (dump_enabled_p ())
404 35697 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
405 627744 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
406 :
407 : /* Mark if we have a non-linear IV. */
408 627744 : LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
409 627744 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
410 : }
411 :
412 :
413 : /* Second - identify all reductions and nested cycles. */
414 520409 : while (worklist.length () > 0)
415 : {
416 138468 : stmt_vec_info stmt_vinfo = worklist.pop ();
417 138468 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
418 138468 : tree def = PHI_RESULT (phi);
419 :
420 138468 : if (dump_enabled_p ())
421 4910 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
422 : (gimple *) phi);
423 :
424 276936 : gcc_assert (!virtual_operand_p (def)
425 : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
426 :
427 138468 : gphi *double_reduc;
428 138468 : stmt_vec_info reduc_stmt_info
429 138468 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
430 138468 : if (reduc_stmt_info && double_reduc)
431 : {
432 1047 : stmt_vec_info inner_phi_info
433 1047 : = loop_vinfo->lookup_stmt (double_reduc);
434 : /* ??? Pass down flag we're the inner loop of a double reduc. */
435 1047 : stmt_vec_info inner_reduc_info
436 1047 : = vect_is_simple_reduction (loop_vinfo, inner_phi_info, NULL);
437 1047 : if (inner_reduc_info)
438 : {
439 945 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
440 945 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
441 945 : STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
442 945 : STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
443 945 : if (dump_enabled_p ())
444 130 : dump_printf_loc (MSG_NOTE, vect_location,
445 : "Detected double reduction.\n");
446 :
447 945 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
448 945 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
449 945 : STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
450 : /* Make it accessible for SLP vectorization. */
451 945 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
452 : }
453 102 : else if (dump_enabled_p ())
454 14 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
455 : "Unknown def-use cycle pattern.\n");
456 : }
457 137421 : else if (reduc_stmt_info)
458 : {
459 70545 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
460 : {
461 2272 : if (dump_enabled_p ())
462 433 : dump_printf_loc (MSG_NOTE, vect_location,
463 : "Detected vectorizable nested cycle.\n");
464 :
465 2272 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
466 : }
467 : else
468 : {
469 68273 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
470 68273 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
471 68273 : if (dump_enabled_p ())
472 3788 : dump_printf_loc (MSG_NOTE, vect_location,
473 : "Detected reduction.\n");
474 :
475 68273 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
476 68273 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
477 68273 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
478 : }
479 : }
480 66876 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
481 487 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
482 : else
483 66389 : if (dump_enabled_p ())
484 473 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
485 : "Unknown def-use cycle pattern.\n");
486 : }
487 381941 : }
488 :
489 :
490 : /* Function vect_analyze_scalar_cycles.
491 :
492 : Examine the cross iteration def-use cycles of scalar variables, by
493 : analyzing the loop-header PHIs of scalar variables. Classify each
494 : cycle as one of the following: invariant, induction, reduction, unknown.
495 : We do that for the loop represented by LOOP_VINFO, and also to its
496 : inner-loop, if exists.
497 : Examples for scalar cycles:
498 :
499 : Example1: reduction:
500 :
501 : loop1:
502 : for (i=0; i<N; i++)
503 : sum += a[i];
504 :
505 : Example2: induction:
506 :
507 : loop2:
508 : for (i=0; i<N; i++)
509 : a[i] = i; */
510 :
511 : static void
512 376412 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
513 : {
514 376412 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
515 :
516 376412 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
517 :
518 : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
519 : Reductions in such inner-loop therefore have different properties than
520 : the reductions in the nest that gets vectorized:
521 : 1. When vectorized, they are executed in the same order as in the original
522 : scalar loop, so we can't change the order of computation when
523 : vectorizing them.
524 : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
525 : current checks are too strict. */
526 :
527 376412 : if (loop->inner)
528 5529 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
529 376412 : }
530 :
531 : /* Function vect_get_loop_niters.
532 :
533 : Determine how many iterations the loop is executed and place it
534 : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
535 : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
536 : niter information holds in ASSUMPTIONS.
537 :
538 : Return the loop exit conditions. */
539 :
540 :
541 : static vec<gcond *>
542 281406 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
543 : tree *number_of_iterations, tree *number_of_iterationsm1)
544 : {
545 281406 : auto_vec<edge> exits = get_loop_exit_edges (loop);
546 281406 : vec<gcond *> conds;
547 562812 : conds.create (exits.length ());
548 281406 : class tree_niter_desc niter_desc;
549 281406 : tree niter_assumptions, niter, may_be_zero;
550 :
551 281406 : *assumptions = boolean_true_node;
552 281406 : *number_of_iterationsm1 = chrec_dont_know;
553 281406 : *number_of_iterations = chrec_dont_know;
554 :
555 281406 : DUMP_VECT_SCOPE ("get_loop_niters");
556 :
557 281406 : if (exits.is_empty ())
558 0 : return conds;
559 :
560 281406 : if (dump_enabled_p ())
561 14584 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
562 : exits.length ());
563 :
564 : edge exit;
565 : unsigned int i;
566 686087 : FOR_EACH_VEC_ELT (exits, i, exit)
567 : {
568 404681 : gcond *cond = get_loop_exit_condition (exit);
569 404681 : if (cond)
570 404651 : conds.safe_push (cond);
571 :
572 404681 : if (dump_enabled_p ())
573 15706 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
574 :
575 404681 : if (exit != main_exit)
576 183601 : continue;
577 :
578 281406 : may_be_zero = NULL_TREE;
579 281406 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
580 281406 : || chrec_contains_undetermined (niter_desc.niter))
581 60326 : continue;
582 :
583 221080 : niter_assumptions = niter_desc.assumptions;
584 221080 : may_be_zero = niter_desc.may_be_zero;
585 221080 : niter = niter_desc.niter;
586 :
587 221080 : if (may_be_zero && integer_zerop (may_be_zero))
588 : may_be_zero = NULL_TREE;
589 :
590 9315 : if (may_be_zero)
591 : {
592 9315 : if (COMPARISON_CLASS_P (may_be_zero))
593 : {
594 : /* Try to combine may_be_zero with assumptions, this can simplify
595 : computation of niter expression. */
596 9315 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
597 1023 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
598 : niter_assumptions,
599 : fold_build1 (TRUTH_NOT_EXPR,
600 : boolean_type_node,
601 : may_be_zero));
602 : else
603 8292 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
604 : build_int_cst (TREE_TYPE (niter), 0),
605 : rewrite_to_non_trapping_overflow (niter));
606 :
607 221080 : may_be_zero = NULL_TREE;
608 : }
609 0 : else if (integer_nonzerop (may_be_zero))
610 : {
611 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
612 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
613 0 : continue;
614 : }
615 : else
616 0 : continue;
617 : }
618 :
619 : /* Loop assumptions are based off the normal exit. */
620 221080 : *assumptions = niter_assumptions;
621 221080 : *number_of_iterationsm1 = niter;
622 :
623 : /* We want the number of loop header executions which is the number
624 : of latch executions plus one.
625 : ??? For UINT_MAX latch executions this number overflows to zero
626 : for loops like do { n++; } while (n != 0); */
627 221080 : if (niter && !chrec_contains_undetermined (niter))
628 : {
629 221080 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
630 : unshare_expr (niter),
631 : build_int_cst (TREE_TYPE (niter), 1));
632 221080 : if (TREE_CODE (niter) == INTEGER_CST
633 120566 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
634 : {
635 : /* If we manage to fold niter + 1 into INTEGER_CST even when
636 : niter is some complex expression, ensure back
637 : *number_of_iterationsm1 is an INTEGER_CST as well. See
638 : PR113210. */
639 0 : *number_of_iterationsm1
640 0 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
641 : build_minus_one_cst (TREE_TYPE (niter)));
642 : }
643 : }
644 221080 : *number_of_iterations = niter;
645 : }
646 :
647 281406 : if (dump_enabled_p ())
648 14584 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
649 :
650 281406 : return conds;
651 281406 : }
652 :
653 : /* Determine the main loop exit for the vectorizer. */
654 :
655 : edge
656 495258 : vec_init_loop_exit_info (class loop *loop)
657 : {
658 : /* Before we begin we must first determine which exit is the main one and
659 : which are auxilary exits. */
660 495258 : auto_vec<edge> exits = get_loop_exit_edges (loop);
661 985507 : if (exits.length () == 0)
662 : return NULL;
663 490249 : if (exits.length () == 1)
664 323299 : return exits[0];
665 :
666 : /* If we have multiple exits, look for counting IV exit.
667 : Analyze all exits and return the last one we can analyze. */
668 166950 : class tree_niter_desc niter_desc;
669 166950 : edge candidate = NULL;
670 617680 : for (edge exit : exits)
671 : {
672 471369 : if (!get_loop_exit_condition (exit))
673 : {
674 20639 : if (dump_enabled_p ())
675 14 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
676 : "Unhandled loop exit detected.\n");
677 20639 : return NULL;
678 : }
679 :
680 450730 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
681 450730 : && !chrec_contains_undetermined (niter_desc.niter))
682 : {
683 133245 : tree may_be_zero = niter_desc.may_be_zero;
684 133245 : if ((integer_zerop (may_be_zero)
685 : /* As we are handling may_be_zero that's not false by
686 : rewriting niter to may_be_zero ? 0 : niter we require
687 : an empty latch. */
688 461244 : || (single_pred_p (loop->latch)
689 10167 : && exit->src == single_pred (loop->latch)
690 2679 : && (integer_nonzerop (may_be_zero)
691 2679 : || COMPARISON_CLASS_P (may_be_zero))))
692 135924 : && (!candidate
693 5934 : || dominated_by_p (CDI_DOMINATORS, exit->src,
694 5934 : candidate->src)))
695 : candidate = exit;
696 : }
697 : }
698 :
699 : /* If no exit is analyzable by scalar evolution, we return the last exit
700 : under the assummption we are dealing with an uncounted loop. */
701 202256 : if (!candidate && single_pred_p (loop->latch))
702 35306 : candidate = loop_exits_from_bb_p (loop, single_pred (loop->latch));
703 :
704 : return candidate;
705 166950 : }
706 :
707 : /* Function bb_in_loop_p
708 :
709 : Used as predicate for dfs order traversal of the loop bbs. */
710 :
711 : static bool
712 1526430 : bb_in_loop_p (const_basic_block bb, const void *data)
713 : {
714 1526430 : const class loop *const loop = (const class loop *)data;
715 1526430 : if (flow_bb_inside_loop_p (loop, bb))
716 : return true;
717 : return false;
718 : }
719 :
720 :
721 : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
722 : stmt_vec_info structs for all the stmts in LOOP_IN. */
723 :
724 499535 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
725 : : vec_info (vec_info::loop, shared),
726 499535 : loop (loop_in),
727 499535 : num_itersm1 (NULL_TREE),
728 499535 : num_iters (NULL_TREE),
729 499535 : num_iters_unchanged (NULL_TREE),
730 499535 : num_iters_assumptions (NULL_TREE),
731 499535 : vector_costs (nullptr),
732 499535 : scalar_costs (nullptr),
733 499535 : th (0),
734 499535 : versioning_threshold (0),
735 499535 : vectorization_factor (0),
736 499535 : main_loop_edge (nullptr),
737 499535 : skip_main_loop_edge (nullptr),
738 499535 : skip_this_loop_edge (nullptr),
739 499535 : reusable_accumulators (),
740 499535 : suggested_unroll_factor (1),
741 499535 : max_vectorization_factor (0),
742 499535 : mask_skip_niters (NULL_TREE),
743 499535 : mask_skip_niters_pfa_offset (NULL_TREE),
744 499535 : rgroup_compare_type (NULL_TREE),
745 499535 : simd_if_cond (NULL_TREE),
746 499535 : partial_vector_style (vect_partial_vectors_none),
747 499535 : unaligned_dr (NULL),
748 499535 : peeling_for_alignment (0),
749 499535 : ptr_mask (0),
750 499535 : max_spec_read_amount (0),
751 499535 : nonlinear_iv (false),
752 499535 : ivexpr_map (NULL),
753 499535 : scan_map (NULL),
754 499535 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
755 499535 : vectorizable (false),
756 499535 : can_use_partial_vectors_p (true),
757 499535 : must_use_partial_vectors_p (false),
758 499535 : using_partial_vectors_p (false),
759 499535 : using_decrementing_iv_p (false),
760 499535 : using_select_vl_p (false),
761 499535 : allow_mutual_alignment (false),
762 499535 : partial_load_store_bias (0),
763 499535 : peeling_for_gaps (false),
764 499535 : peeling_for_niter (false),
765 499535 : early_breaks (false),
766 499535 : user_unroll (false),
767 499535 : no_data_dependencies (false),
768 499535 : has_mask_store (false),
769 499535 : scalar_loop_scaling (profile_probability::uninitialized ()),
770 499535 : scalar_loop (NULL),
771 499535 : main_loop_info (NULL),
772 499535 : orig_loop_info (NULL),
773 499535 : epilogue_vinfo (NULL),
774 499535 : drs_advanced_by (NULL_TREE),
775 499535 : vec_loop_main_exit (NULL),
776 499535 : vec_epilogue_loop_main_exit (NULL),
777 499535 : scalar_loop_main_exit (NULL)
778 : {
779 : /* CHECKME: We want to visit all BBs before their successors (except for
780 : latch blocks, for which this assertion wouldn't hold). In the simple
781 : case of the loop forms we allow, a dfs order of the BBs would the same
782 : as reversed postorder traversal, so we are safe. */
783 :
784 499535 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
785 999070 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
786 499535 : loop->num_nodes, loop);
787 499535 : gcc_assert (nbbs == loop->num_nodes);
788 :
789 1775324 : for (unsigned int i = 0; i < nbbs; i++)
790 : {
791 1275789 : basic_block bb = bbs[i];
792 1275789 : gimple_stmt_iterator si;
793 :
794 2570468 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
795 : {
796 1294679 : gimple *phi = gsi_stmt (si);
797 1294679 : gimple_set_uid (phi, 0);
798 1294679 : add_stmt (phi);
799 : }
800 :
801 11711067 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
802 : {
803 9159489 : gimple *stmt = gsi_stmt (si);
804 9159489 : gimple_set_uid (stmt, 0);
805 9159489 : if (is_gimple_debug (stmt))
806 4045008 : continue;
807 5114481 : add_stmt (stmt);
808 : /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
809 : third argument is the #pragma omp simd if (x) condition, when 0,
810 : loop shouldn't be vectorized, when non-zero constant, it should
811 : be vectorized normally, otherwise versioned with vectorized loop
812 : done if the condition is non-zero at runtime. */
813 5114481 : if (loop_in->simduid
814 43372 : && is_gimple_call (stmt)
815 4268 : && gimple_call_internal_p (stmt)
816 4141 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
817 4137 : && gimple_call_num_args (stmt) >= 3
818 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
819 5114584 : && (loop_in->simduid
820 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
821 : {
822 103 : tree arg = gimple_call_arg (stmt, 2);
823 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
824 103 : simd_if_cond = arg;
825 : else
826 0 : gcc_assert (integer_nonzerop (arg));
827 : }
828 : }
829 : }
830 499535 : }
831 :
832 : /* Free all levels of rgroup CONTROLS. */
833 :
834 : void
835 1260961 : release_vec_loop_controls (vec<rgroup_controls> *controls)
836 : {
837 1260961 : rgroup_controls *rgc;
838 1260961 : unsigned int i;
839 1278535 : FOR_EACH_VEC_ELT (*controls, i, rgc)
840 17574 : rgc->controls.release ();
841 1260961 : controls->release ();
842 1260961 : }
843 :
844 : /* Free all memory used by the _loop_vec_info, as well as all the
845 : stmt_vec_info structs of all the stmts in the loop. */
846 :
847 499535 : _loop_vec_info::~_loop_vec_info ()
848 : {
849 499535 : free (bbs);
850 :
851 499535 : release_vec_loop_controls (&masks.rgc_vec);
852 499535 : release_vec_loop_controls (&lens);
853 503481 : delete ivexpr_map;
854 499857 : delete scan_map;
855 499535 : delete scalar_costs;
856 499535 : delete vector_costs;
857 646193 : for (auto reduc_info : reduc_infos)
858 142672 : delete reduc_info;
859 :
860 : /* When we release an epiloge vinfo that we do not intend to use
861 : avoid clearing AUX of the main loop which should continue to
862 : point to the main loop vinfo since otherwise we'll leak that. */
863 499535 : if (loop->aux == this)
864 61422 : loop->aux = NULL;
865 999070 : }
866 :
867 : /* Return an invariant or register for EXPR and emit necessary
868 : computations in the LOOP_VINFO loop preheader. */
869 :
870 : tree
871 20721 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
872 : {
873 20721 : if (is_gimple_reg (expr)
874 20721 : || is_gimple_min_invariant (expr))
875 6994 : return expr;
876 :
877 13727 : if (! loop_vinfo->ivexpr_map)
878 3946 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
879 13727 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
880 13727 : if (! cached)
881 : {
882 8893 : gimple_seq stmts = NULL;
883 8893 : cached = force_gimple_operand (unshare_expr (expr),
884 : &stmts, true, NULL_TREE);
885 8893 : if (stmts)
886 : {
887 8753 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
888 8753 : gsi_insert_seq_on_edge_immediate (e, stmts);
889 : }
890 : }
891 13727 : return cached;
892 : }
893 :
894 : /* Return true if we can use CMP_TYPE as the comparison type to produce
895 : all masks required to mask LOOP_VINFO. */
896 :
897 : static bool
898 78817 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
899 : {
900 78817 : rgroup_controls *rgm;
901 78817 : unsigned int i;
902 91375 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
903 91375 : if (rgm->type != NULL_TREE
904 91375 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
905 : cmp_type, rgm->type,
906 : OPTIMIZE_FOR_SPEED))
907 : return false;
908 : return true;
909 : }
910 :
911 : /* Calculate the maximum number of scalars per iteration for every
912 : rgroup in LOOP_VINFO. */
913 :
914 : static unsigned int
915 16755 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
916 : {
917 16755 : unsigned int res = 1;
918 16755 : unsigned int i;
919 16755 : rgroup_controls *rgm;
920 41044 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
921 24289 : res = MAX (res, rgm->max_nscalars_per_iter);
922 16755 : return res;
923 : }
924 :
925 : /* Calculate the minimum precision necessary to represent:
926 :
927 : MAX_NITERS * FACTOR
928 :
929 : as an unsigned integer, where MAX_NITERS is the maximum number of
930 : loop header iterations for the original scalar form of LOOP_VINFO. */
931 :
932 : unsigned
933 18597 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
934 : {
935 18597 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
936 :
937 : /* Get the maximum number of iterations that is representable
938 : in the counter type. */
939 18597 : tree ni_type;
940 18597 : if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
941 18597 : ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
942 : else
943 0 : ni_type = sizetype;
944 18597 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
945 :
946 : /* Get a more refined estimate for the number of iterations. */
947 18597 : widest_int max_back_edges;
948 18597 : if (max_loop_iterations (loop, &max_back_edges))
949 18597 : max_ni = wi::smin (max_ni, max_back_edges + 1);
950 :
951 : /* Work out how many bits we need to represent the limit. */
952 18597 : return wi::min_precision (max_ni * factor, UNSIGNED);
953 18597 : }
954 :
955 : /* True if the loop needs peeling or partial vectors when vectorized. */
956 :
957 : static bool
958 112665 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
959 : {
960 112665 : unsigned HOST_WIDE_INT const_vf;
961 :
962 112665 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
963 : return true;
964 :
965 12581 : loop_vec_info main_loop_vinfo
966 111403 : = (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
967 111403 : ? LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) : loop_vinfo);
968 111403 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
969 53451 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo) >= 0)
970 : {
971 : /* Work out the (constant) number of iterations that need to be
972 : peeled for reasons other than niters. */
973 53408 : unsigned int peel_niter
974 : = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
975 53408 : return !multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
976 53408 : LOOP_VINFO_VECT_FACTOR (loop_vinfo));
977 : }
978 :
979 57995 : if (!LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo)
980 57995 : && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf))
981 : {
982 : /* When the number of iterations is a multiple of the vectorization
983 : factor and we are not doing prologue or forced epilogue peeling
984 : the epilogue isn't necessary. */
985 57584 : if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
986 115168 : >= (unsigned) exact_log2 (const_vf))
987 : return false;
988 : }
989 :
990 : return true;
991 : }
992 :
993 : /* Each statement in LOOP_VINFO can be masked where necessary. Check
994 : whether we can actually generate the masks required. Return true if so,
995 : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
996 :
997 : static bool
998 16755 : vect_verify_full_masking (loop_vec_info loop_vinfo)
999 : {
1000 16755 : unsigned int min_ni_width;
1001 :
1002 : /* Use a normal loop if there are no statements that need masking.
1003 : This only happens in rare degenerate cases: it means that the loop
1004 : has no loads, no stores, and no live-out values. */
1005 16755 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1006 : return false;
1007 :
1008 : /* Produce the rgroup controls. */
1009 67279 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1010 : {
1011 25262 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1012 25262 : tree vectype = mask.first;
1013 25262 : unsigned nvectors = mask.second;
1014 :
1015 33769 : if (masks->rgc_vec.length () < nvectors)
1016 18444 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1017 25262 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1018 : /* The number of scalars per iteration and the number of vectors are
1019 : both compile-time constants. */
1020 25262 : unsigned int nscalars_per_iter
1021 25262 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1022 25262 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1023 :
1024 25262 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1025 : {
1026 20032 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1027 20032 : rgm->type = truth_type_for (vectype);
1028 20032 : rgm->factor = 1;
1029 : }
1030 : }
1031 :
1032 16755 : unsigned int max_nscalars_per_iter
1033 16755 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1034 :
1035 : /* Work out how many bits we need to represent the limit. */
1036 16755 : min_ni_width
1037 16755 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1038 :
1039 : /* Find a scalar mode for which WHILE_ULT is supported. */
1040 16755 : opt_scalar_int_mode cmp_mode_iter;
1041 16755 : tree cmp_type = NULL_TREE;
1042 16755 : tree iv_type = NULL_TREE;
1043 16755 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1044 16755 : unsigned int iv_precision = UINT_MAX;
1045 :
1046 16755 : if (iv_limit != -1)
1047 16755 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1048 : UNSIGNED);
1049 :
1050 134040 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1051 : {
1052 117285 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1053 117285 : if (cmp_bits >= min_ni_width
1054 117285 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1055 : {
1056 78817 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1057 78817 : if (this_type
1058 78817 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1059 : {
1060 : /* Although we could stop as soon as we find a valid mode,
1061 : there are at least two reasons why that's not always the
1062 : best choice:
1063 :
1064 : - An IV that's Pmode or wider is more likely to be reusable
1065 : in address calculations than an IV that's narrower than
1066 : Pmode.
1067 :
1068 : - Doing the comparison in IV_PRECISION or wider allows
1069 : a natural 0-based IV, whereas using a narrower comparison
1070 : type requires mitigations against wrap-around.
1071 :
1072 : Conversely, if the IV limit is variable, doing the comparison
1073 : in a wider type than the original type can introduce
1074 : unnecessary extensions, so picking the widest valid mode
1075 : is not always a good choice either.
1076 :
1077 : Here we prefer the first IV type that's Pmode or wider,
1078 : and the first comparison type that's IV_PRECISION or wider.
1079 : (The comparison type must be no wider than the IV type,
1080 : to avoid extensions in the vector loop.)
1081 :
1082 : ??? We might want to try continuing beyond Pmode for ILP32
1083 : targets if CMP_BITS < IV_PRECISION. */
1084 0 : iv_type = this_type;
1085 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1086 : cmp_type = this_type;
1087 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1088 : break;
1089 : }
1090 : }
1091 : }
1092 :
1093 16755 : if (!cmp_type)
1094 : {
1095 16755 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1096 16755 : return false;
1097 : }
1098 :
1099 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1100 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1101 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1102 0 : return true;
1103 16755 : }
1104 :
1105 : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1106 : whether we can actually generate AVX512 style masks. Return true if so,
1107 : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1108 :
1109 : static bool
1110 16755 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1111 : {
1112 : /* Produce differently organized rgc_vec and differently check
1113 : we can produce masks. */
1114 :
1115 : /* Use a normal loop if there are no statements that need masking.
1116 : This only happens in rare degenerate cases: it means that the loop
1117 : has no loads, no stores, and no live-out values. */
1118 16755 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1119 : return false;
1120 :
1121 : /* For the decrementing IV we need to represent all values in
1122 : [0, niter + niter_skip] where niter_skip is the elements we
1123 : skip in the first iteration for prologue peeling. */
1124 16755 : tree iv_type = NULL_TREE;
1125 16755 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1126 16755 : unsigned int iv_precision = UINT_MAX;
1127 16755 : if (iv_limit != -1)
1128 16755 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1129 :
1130 : /* First compute the type for the IV we use to track the remaining
1131 : scalar iterations. */
1132 16755 : opt_scalar_int_mode cmp_mode_iter;
1133 21696 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1134 : {
1135 21696 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1136 21696 : if (cmp_bits >= iv_precision
1137 21696 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1138 : {
1139 16755 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1140 16755 : if (iv_type)
1141 : break;
1142 : }
1143 : }
1144 16755 : if (!iv_type)
1145 : return false;
1146 :
1147 : /* Produce the rgroup controls. */
1148 67279 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1149 : {
1150 25262 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1151 25262 : tree vectype = mask.first;
1152 25262 : unsigned nvectors = mask.second;
1153 :
1154 : /* The number of scalars per iteration and the number of vectors are
1155 : both compile-time constants. */
1156 25262 : unsigned int nscalars_per_iter
1157 25262 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1158 25262 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1159 :
1160 : /* We index the rgroup_controls vector with nscalars_per_iter
1161 : which we keep constant and instead have a varying nvectors,
1162 : remembering the vector mask with the fewest nV. */
1163 33769 : if (masks->rgc_vec.length () < nscalars_per_iter)
1164 16798 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1165 25262 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1166 :
1167 25262 : if (!rgm->type || rgm->factor > nvectors)
1168 : {
1169 18374 : rgm->type = truth_type_for (vectype);
1170 18374 : rgm->compare_type = NULL_TREE;
1171 18374 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1172 18374 : rgm->factor = nvectors;
1173 18374 : rgm->bias_adjusted_ctrl = NULL_TREE;
1174 : }
1175 : }
1176 :
1177 : /* There is no fixed compare type we are going to use but we have to
1178 : be able to get at one for each mask group. */
1179 16755 : unsigned int min_ni_width
1180 16755 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1181 :
1182 16755 : bool ok = true;
1183 64142 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1184 : {
1185 17532 : tree mask_type = rgc.type;
1186 17532 : if (!mask_type)
1187 700 : continue;
1188 :
1189 : /* For now vect_get_loop_mask only supports integer mode masks
1190 : when we need to split it. */
1191 16832 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1192 16832 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1193 : {
1194 : ok = false;
1195 : break;
1196 : }
1197 :
1198 : /* If iv_type is usable as compare type use that - we can elide the
1199 : saturation in that case. */
1200 13181 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1201 : {
1202 13181 : tree cmp_vectype
1203 13181 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1204 13181 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1205 4668 : rgc.compare_type = cmp_vectype;
1206 : }
1207 13181 : if (!rgc.compare_type)
1208 24888 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1209 : {
1210 24884 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1211 24884 : if (cmp_bits >= min_ni_width
1212 24884 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1213 : {
1214 24872 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1215 24872 : if (!cmp_type)
1216 0 : continue;
1217 :
1218 : /* Check whether we can produce the mask with cmp_type. */
1219 24872 : tree cmp_vectype
1220 24872 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1221 24872 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1222 : {
1223 8509 : rgc.compare_type = cmp_vectype;
1224 8509 : break;
1225 : }
1226 : }
1227 : }
1228 13181 : if (!rgc.compare_type)
1229 : {
1230 : ok = false;
1231 : break;
1232 : }
1233 : }
1234 16755 : if (!ok)
1235 : {
1236 3655 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1237 3655 : return false;
1238 : }
1239 :
1240 13100 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1241 13100 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1242 13100 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1243 13100 : return true;
1244 16755 : }
1245 :
1246 : /* Check whether we can use vector access with length based on precison
1247 : comparison. So far, to keep it simple, we only allow the case that the
1248 : precision of the target supported length is larger than the precision
1249 : required by loop niters. */
1250 :
1251 : static bool
1252 6 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1253 : {
1254 6 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1255 : return false;
1256 :
1257 0 : if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
1258 : return false;
1259 :
1260 0 : machine_mode len_load_mode, len_store_mode;
1261 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1262 0 : .exists (&len_load_mode))
1263 0 : return false;
1264 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1265 0 : .exists (&len_store_mode))
1266 0 : return false;
1267 :
1268 0 : signed char partial_load_bias = internal_len_load_store_bias
1269 0 : (IFN_LEN_LOAD, len_load_mode);
1270 :
1271 0 : signed char partial_store_bias = internal_len_load_store_bias
1272 0 : (IFN_LEN_STORE, len_store_mode);
1273 :
1274 0 : gcc_assert (partial_load_bias == partial_store_bias);
1275 :
1276 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1277 : return false;
1278 :
1279 : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1280 : len_loads with a length of zero. In order to avoid that we prohibit
1281 : more than one loop length here. */
1282 0 : if (partial_load_bias == -1
1283 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1284 : return false;
1285 :
1286 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1287 :
1288 0 : unsigned int max_nitems_per_iter = 1;
1289 0 : unsigned int i;
1290 0 : rgroup_controls *rgl;
1291 : /* Find the maximum number of items per iteration for every rgroup. */
1292 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1293 : {
1294 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1295 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1296 : }
1297 :
1298 : /* Work out how many bits we need to represent the length limit. */
1299 0 : unsigned int min_ni_prec
1300 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1301 :
1302 : /* Now use the maximum of below precisions for one suitable IV type:
1303 : - the IV's natural precision
1304 : - the precision needed to hold: the maximum number of scalar
1305 : iterations multiplied by the scale factor (min_ni_prec above)
1306 : - the Pmode precision
1307 :
1308 : If min_ni_prec is less than the precision of the current niters,
1309 : we perfer to still use the niters type. Prefer to use Pmode and
1310 : wider IV to avoid narrow conversions. */
1311 :
1312 0 : unsigned int ni_prec
1313 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1314 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1315 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1316 :
1317 0 : tree iv_type = NULL_TREE;
1318 0 : opt_scalar_int_mode tmode_iter;
1319 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1320 : {
1321 0 : scalar_mode tmode = tmode_iter.require ();
1322 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1323 :
1324 : /* ??? Do we really want to construct one IV whose precision exceeds
1325 : BITS_PER_WORD? */
1326 0 : if (tbits > BITS_PER_WORD)
1327 : break;
1328 :
1329 : /* Find the first available standard integral type. */
1330 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1331 : {
1332 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1333 0 : break;
1334 : }
1335 : }
1336 :
1337 0 : if (!iv_type)
1338 : {
1339 0 : if (dump_enabled_p ())
1340 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1341 : "can't vectorize with length-based partial vectors"
1342 : " because there is no suitable iv type.\n");
1343 0 : return false;
1344 : }
1345 :
1346 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1347 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1348 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1349 :
1350 0 : return true;
1351 : }
1352 :
1353 : /* Calculate the cost of one scalar iteration of the loop. */
1354 : static void
1355 297469 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1356 : {
1357 297469 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1358 297469 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1359 297469 : int nbbs = loop->num_nodes, factor;
1360 297469 : int innerloop_iters, i;
1361 :
1362 297469 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1363 :
1364 : /* Gather costs for statements in the scalar loop. */
1365 :
1366 : /* FORNOW. */
1367 297469 : innerloop_iters = 1;
1368 297469 : if (loop->inner)
1369 1348 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1370 :
1371 1048383 : for (i = 0; i < nbbs; i++)
1372 : {
1373 750914 : gimple_stmt_iterator si;
1374 750914 : basic_block bb = bbs[i];
1375 :
1376 750914 : if (bb->loop_father == loop->inner)
1377 : factor = innerloop_iters;
1378 : else
1379 748218 : factor = 1;
1380 :
1381 5968883 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1382 : {
1383 4467055 : gimple *stmt = gsi_stmt (si);
1384 4467055 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1385 :
1386 4467055 : if (!is_gimple_assign (stmt)
1387 : && !is_gimple_call (stmt)
1388 : && !is_a<gcond *> (stmt))
1389 1653876 : continue;
1390 :
1391 : /* Skip stmts that are not vectorized inside the loop. */
1392 2813179 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1393 2813179 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1394 1369395 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1395 47 : || !VECTORIZABLE_CYCLE_DEF
1396 : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1397 1369395 : continue;
1398 :
1399 1443784 : vect_cost_for_stmt kind;
1400 1443784 : if (STMT_VINFO_DATA_REF (stmt_info))
1401 : {
1402 688986 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1403 : kind = scalar_load;
1404 : else
1405 242045 : kind = scalar_store;
1406 : }
1407 754798 : else if (vect_nop_conversion_p (stmt_info))
1408 42762 : continue;
1409 : else
1410 : kind = scalar_stmt;
1411 :
1412 : /* We are using vect_prologue here to avoid scaling twice
1413 : by the inner loop factor. */
1414 1401022 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1415 : factor, kind, stmt_info, 0, vect_prologue);
1416 : }
1417 : }
1418 :
1419 : /* Now accumulate cost. */
1420 297469 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1421 297469 : add_stmt_costs (loop_vinfo->scalar_costs,
1422 : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1423 297469 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1424 297469 : }
1425 :
1426 : /* Function vect_analyze_loop_form.
1427 :
1428 : Verify that certain CFG restrictions hold, including:
1429 : - the loop has a pre-header
1430 : - the loop has a single entry
1431 : - nested loops can have only a single exit.
1432 : - the loop exit condition is simple enough
1433 : - the number of iterations can be analyzed, i.e, a countable loop. The
1434 : niter could be analyzed under some assumptions. */
1435 :
1436 : opt_result
1437 458724 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1438 : vect_loop_form_info *info)
1439 : {
1440 458724 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1441 :
1442 458724 : edge exit_e = vec_init_loop_exit_info (loop);
1443 458724 : if (!exit_e)
1444 29645 : return opt_result::failure_at (vect_location,
1445 : "not vectorized:"
1446 : " Infinite loop detected.\n");
1447 429079 : if (loop_vectorized_call)
1448 : {
1449 28766 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1450 28766 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1451 28766 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1452 28766 : if (!scalar_exit_e)
1453 0 : return opt_result::failure_at (vect_location,
1454 : "not vectorized:"
1455 : " could not determine main exit from"
1456 : " loop with multiple exits.\n");
1457 : }
1458 :
1459 429079 : info->loop_exit = exit_e;
1460 429079 : if (dump_enabled_p ())
1461 15974 : dump_printf_loc (MSG_NOTE, vect_location,
1462 : "using as main loop exit: %d -> %d [AUX: %p]\n",
1463 15974 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1464 :
1465 : /* Check if we have any control flow that doesn't leave the loop. */
1466 429079 : basic_block *bbs = get_loop_body (loop);
1467 1403581 : for (unsigned i = 0; i < loop->num_nodes; i++)
1468 1091109 : if (EDGE_COUNT (bbs[i]->succs) != 1
1469 1091109 : && (EDGE_COUNT (bbs[i]->succs) != 2
1470 654660 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1471 : {
1472 116607 : free (bbs);
1473 116607 : return opt_result::failure_at (vect_location,
1474 : "not vectorized:"
1475 : " unsupported control flow in loop.\n");
1476 : }
1477 :
1478 : /* Check if we have any control flow that doesn't leave the loop. */
1479 313567 : bool has_phi = false;
1480 313567 : for (unsigned i = 0; i < loop->num_nodes; i++)
1481 313110 : if (!gimple_seq_empty_p (phi_nodes (bbs[i])))
1482 : {
1483 : has_phi = true;
1484 : break;
1485 : }
1486 312472 : if (!has_phi)
1487 457 : return opt_result::failure_at (vect_location,
1488 : "not vectorized:"
1489 : " no scalar evolution detected in loop.\n");
1490 :
1491 312015 : free (bbs);
1492 :
1493 : /* Different restrictions apply when we are considering an inner-most loop,
1494 : vs. an outer (nested) loop.
1495 : (FORNOW. May want to relax some of these restrictions in the future). */
1496 :
1497 312015 : info->inner_loop_cond = NULL;
1498 312015 : if (!loop->inner)
1499 : {
1500 : /* Inner-most loop. */
1501 :
1502 293545 : if (empty_block_p (loop->header))
1503 0 : return opt_result::failure_at (vect_location,
1504 : "not vectorized: empty loop.\n");
1505 : }
1506 : else
1507 : {
1508 18470 : class loop *innerloop = loop->inner;
1509 18470 : edge entryedge;
1510 :
1511 : /* Nested loop. We currently require that the loop is doubly-nested,
1512 : contains a single inner loop with a single exit to the block
1513 : with the single exit condition in the outer loop.
1514 : Vectorizable outer-loops look like this:
1515 :
1516 : (pre-header)
1517 : |
1518 : header <---+
1519 : | |
1520 : inner-loop |
1521 : | |
1522 : tail ------+
1523 : |
1524 : (exit-bb)
1525 :
1526 : The inner-loop also has the properties expected of inner-most loops
1527 : as described above. */
1528 :
1529 18470 : if ((loop->inner)->inner || (loop->inner)->next)
1530 2936 : return opt_result::failure_at (vect_location,
1531 : "not vectorized:"
1532 : " multiple nested loops.\n");
1533 :
1534 15534 : entryedge = loop_preheader_edge (innerloop);
1535 15534 : if (entryedge->src != loop->header
1536 15039 : || !single_exit (innerloop)
1537 26946 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1538 4464 : return opt_result::failure_at (vect_location,
1539 : "not vectorized:"
1540 : " unsupported outerloop form.\n");
1541 :
1542 : /* Analyze the inner-loop. */
1543 11070 : vect_loop_form_info inner;
1544 11070 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1545 11070 : if (!res)
1546 : {
1547 416 : if (dump_enabled_p ())
1548 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1549 : "not vectorized: Bad inner loop.\n");
1550 416 : return res;
1551 : }
1552 :
1553 : /* Don't support analyzing niter under assumptions for inner
1554 : loop. */
1555 10654 : if (!integer_onep (inner.assumptions))
1556 257 : return opt_result::failure_at (vect_location,
1557 : "not vectorized: Bad inner loop.\n");
1558 :
1559 10397 : if (inner.number_of_iterations == chrec_dont_know
1560 10397 : || !expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1561 1837 : return opt_result::failure_at (vect_location,
1562 : "not vectorized: inner-loop count not"
1563 : " invariant.\n");
1564 :
1565 8560 : if (dump_enabled_p ())
1566 1046 : dump_printf_loc (MSG_NOTE, vect_location,
1567 : "Considering outer-loop vectorization.\n");
1568 8560 : info->inner_loop_cond = inner.conds[0];
1569 11070 : }
1570 :
1571 302105 : if (EDGE_COUNT (loop->header->preds) != 2)
1572 0 : return opt_result::failure_at (vect_location,
1573 : "not vectorized:"
1574 : " too many incoming edges.\n");
1575 :
1576 : /* We assume that the latch is empty. */
1577 302105 : basic_block latch = loop->latch;
1578 302105 : do
1579 : {
1580 302105 : if (!empty_block_p (latch)
1581 302105 : || !gimple_seq_empty_p (phi_nodes (latch)))
1582 20666 : return opt_result::failure_at (vect_location,
1583 : "not vectorized: latch block not "
1584 : "empty.\n");
1585 281439 : latch = single_pred (latch);
1586 : }
1587 562878 : while (single_succ_p (latch));
1588 :
1589 : /* Make sure there is no abnormal exit. */
1590 281439 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1591 1248998 : for (edge e : exits)
1592 : {
1593 404714 : if (e->flags & EDGE_ABNORMAL)
1594 33 : return opt_result::failure_at (vect_location,
1595 : "not vectorized:"
1596 : " abnormal loop exit edge.\n");
1597 : }
1598 :
1599 281406 : info->conds
1600 281406 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1601 : &info->number_of_iterations,
1602 281406 : &info->number_of_iterationsm1);
1603 281406 : if (info->conds.is_empty ())
1604 30 : return opt_result::failure_at
1605 30 : (vect_location,
1606 : "not vectorized: complicated exit condition.\n");
1607 :
1608 : /* Determine what the primary and alternate exit conds are. */
1609 686027 : for (unsigned i = 0; i < info->conds.length (); i++)
1610 : {
1611 404651 : gcond *cond = info->conds[i];
1612 404651 : if (exit_e->src == gimple_bb (cond))
1613 281376 : std::swap (info->conds[0], info->conds[i]);
1614 : }
1615 :
1616 281376 : if (chrec_contains_undetermined (info->number_of_iterations))
1617 : {
1618 60296 : if (dump_enabled_p ())
1619 256 : dump_printf_loc (MSG_NOTE, vect_location,
1620 : "Loop being analyzed as uncounted.\n");
1621 60296 : if (loop->inner)
1622 562 : return opt_result::failure_at
1623 562 : (vect_location,
1624 : "not vectorized: outer loop vectorization of uncounted loops"
1625 : " is unsupported.\n");
1626 59734 : return opt_result::success ();
1627 : }
1628 :
1629 221080 : if (integer_zerop (info->assumptions))
1630 4 : return opt_result::failure_at
1631 4 : (info->conds[0],
1632 : "not vectorized: number of iterations cannot be computed.\n");
1633 :
1634 221076 : if (integer_zerop (info->number_of_iterations))
1635 12 : return opt_result::failure_at
1636 12 : (info->conds[0],
1637 : "not vectorized: number of iterations = 0.\n");
1638 :
1639 221064 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1640 120543 : && tree_to_shwi (info->number_of_iterations) > 0))
1641 : {
1642 100521 : if (dump_enabled_p ())
1643 : {
1644 2469 : dump_printf_loc (MSG_NOTE, vect_location,
1645 : "Symbolic number of iterations is ");
1646 2469 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1647 2469 : dump_printf (MSG_NOTE, "\n");
1648 : }
1649 : }
1650 :
1651 221064 : if (!integer_onep (info->assumptions))
1652 : {
1653 8517 : if (dump_enabled_p ())
1654 : {
1655 66 : dump_printf_loc (MSG_NOTE, vect_location,
1656 : "Loop to be versioned with niter assumption ");
1657 66 : dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
1658 66 : dump_printf (MSG_NOTE, "\n");
1659 : }
1660 : }
1661 :
1662 221064 : return opt_result::success ();
1663 281439 : }
1664 :
1665 : /* Create a loop_vec_info for LOOP with SHARED and the
1666 : vect_analyze_loop_form result. */
1667 :
1668 : loop_vec_info
1669 499535 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1670 : const vect_loop_form_info *info,
1671 : loop_vec_info orig_loop_info)
1672 : {
1673 499535 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1674 499535 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1675 499535 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1676 499535 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1677 499535 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1678 499535 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1679 171 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1680 171 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1681 : else
1682 499364 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1683 : /* Also record the assumptions for versioning. */
1684 499535 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1685 18811 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1686 :
1687 2248781 : for (gcond *cond : info->conds)
1688 : {
1689 750176 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1690 : /* Mark the statement as a condition. */
1691 750176 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1692 : }
1693 :
1694 499535 : unsigned cond_id = 0;
1695 499535 : if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
1696 411924 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[cond_id++];
1697 :
1698 837787 : for (; cond_id < info->conds.length (); cond_id ++)
1699 338252 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[cond_id]);
1700 :
1701 499535 : LOOP_VINFO_MAIN_EXIT (loop_vinfo) = info->loop_exit;
1702 :
1703 : /* Check to see if we're vectorizing multiple exits. */
1704 499535 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1705 499535 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1706 :
1707 499535 : if (info->inner_loop_cond)
1708 : {
1709 : /* If we have an estimate on the number of iterations of the inner
1710 : loop use that to limit the scale for costing, otherwise use
1711 : --param vect-inner-loop-cost-factor literally. */
1712 8681 : widest_int nit;
1713 8681 : if (estimated_stmt_executions (loop->inner, &nit))
1714 7392 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1715 7392 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1716 8681 : }
1717 :
1718 499535 : return loop_vinfo;
1719 : }
1720 :
1721 :
1722 :
1723 : /* Return true if we know that the iteration count is smaller than the
1724 : vectorization factor. Return false if it isn't, or if we can't be sure
1725 : either way. */
1726 :
1727 : static bool
1728 111856 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1729 : {
1730 111856 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1731 :
1732 111856 : HOST_WIDE_INT max_niter;
1733 111856 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1734 53669 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1735 : else
1736 58187 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1737 :
1738 111856 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1739 10781 : return true;
1740 :
1741 : return false;
1742 : }
1743 :
1744 : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1745 : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1746 : definitely no, or -1 if it's worth retrying. */
1747 :
1748 : static int
1749 111864 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1750 : unsigned *suggested_unroll_factor)
1751 : {
1752 111864 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1753 111864 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1754 :
1755 : /* Only loops that can handle partially-populated vectors can have iteration
1756 : counts less than the vectorization factor. */
1757 111864 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
1758 111864 : && vect_known_niters_smaller_than_vf (loop_vinfo))
1759 : {
1760 10771 : if (dump_enabled_p ())
1761 237 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1762 : "not vectorized: iteration count smaller than "
1763 : "vectorization factor.\n");
1764 10771 : return 0;
1765 : }
1766 :
1767 : /* If we know the number of iterations we can do better, for the
1768 : epilogue we can also decide whether the main loop leaves us
1769 : with enough iterations, prefering a smaller vector epilog then
1770 : also possibly used for the case we skip the vector loop. */
1771 101093 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1772 : {
1773 44120 : widest_int scalar_niters
1774 44120 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
1775 44120 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1776 : {
1777 2691 : loop_vec_info orig_loop_vinfo
1778 : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1779 2691 : loop_vec_info main_loop_vinfo
1780 : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
1781 2691 : unsigned lowest_vf
1782 2691 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
1783 2691 : int prolog_peeling = 0;
1784 2691 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
1785 2691 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
1786 2691 : if (prolog_peeling >= 0
1787 2691 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
1788 : lowest_vf))
1789 : {
1790 5372 : unsigned gap
1791 2686 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
1792 5372 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
1793 5372 : % lowest_vf + gap);
1794 : }
1795 : }
1796 : /* Reject vectorizing for a single scalar iteration, even if
1797 : we could in principle implement that using partial vectors.
1798 : But allow such vectorization if VF == 1 in case we do not
1799 : need to peel for gaps (if we need, avoid vectorization for
1800 : reasons of code footprint). */
1801 44120 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1802 44120 : if (scalar_niters <= peeling_gap + 1
1803 44120 : && (assumed_vf > 1 || peeling_gap != 0))
1804 : {
1805 690 : if (dump_enabled_p ())
1806 159 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1807 : "not vectorized: loop only has a single "
1808 : "scalar iteration.\n");
1809 690 : return 0;
1810 : }
1811 :
1812 43430 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1813 : {
1814 : /* Check that the loop processes at least one full vector. */
1815 43419 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1816 43419 : if (known_lt (scalar_niters, vf))
1817 : {
1818 364 : if (dump_enabled_p ())
1819 296 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1820 : "loop does not have enough iterations "
1821 : "to support vectorization.\n");
1822 404 : return 0;
1823 : }
1824 :
1825 : /* If we need to peel an extra epilogue iteration to handle data
1826 : accesses with gaps, check that there are enough scalar iterations
1827 : available.
1828 :
1829 : The check above is redundant with this one when peeling for gaps,
1830 : but the distinction is useful for diagnostics. */
1831 43055 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1832 43354 : && known_le (scalar_niters, vf))
1833 : {
1834 40 : if (dump_enabled_p ())
1835 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1836 : "loop does not have enough iterations "
1837 : "to support peeling for gaps.\n");
1838 40 : return 0;
1839 : }
1840 : }
1841 44120 : }
1842 :
1843 : /* If using the "very cheap" model. reject cases in which we'd keep
1844 : a copy of the scalar code (even if we might be able to vectorize it). */
1845 99999 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1846 99999 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1847 49298 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
1848 : {
1849 721 : if (dump_enabled_p ())
1850 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1851 : "some scalar iterations would need to be peeled\n");
1852 721 : return 0;
1853 : }
1854 :
1855 99278 : int min_profitable_iters, min_profitable_estimate;
1856 99278 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1857 : &min_profitable_estimate,
1858 : suggested_unroll_factor);
1859 :
1860 99278 : if (min_profitable_iters < 0)
1861 : {
1862 24149 : if (dump_enabled_p ())
1863 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864 : "not vectorized: vectorization not profitable.\n");
1865 24149 : if (dump_enabled_p ())
1866 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 : "not vectorized: vector version will never be "
1868 : "profitable.\n");
1869 24149 : return -1;
1870 : }
1871 :
1872 75129 : int min_scalar_loop_bound = (param_min_vect_loop_bound
1873 75129 : * assumed_vf);
1874 :
1875 : /* Use the cost model only if it is more conservative than user specified
1876 : threshold. */
1877 75129 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1878 : min_profitable_iters);
1879 :
1880 75129 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1881 :
1882 38062 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1883 113191 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1884 : {
1885 384 : if (dump_enabled_p ())
1886 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1887 : "not vectorized: vectorization not profitable.\n");
1888 384 : if (dump_enabled_p ())
1889 1 : dump_printf_loc (MSG_NOTE, vect_location,
1890 : "not vectorized: iteration count smaller than user "
1891 : "specified loop bound parameter or minimum profitable "
1892 : "iterations (whichever is more conservative).\n");
1893 384 : return 0;
1894 : }
1895 :
1896 : /* The static profitablity threshold min_profitable_estimate includes
1897 : the cost of having to check at runtime whether the scalar loop
1898 : should be used instead. If it turns out that we don't need or want
1899 : such a check, the threshold we should use for the static estimate
1900 : is simply the point at which the vector loop becomes more profitable
1901 : than the scalar loop. */
1902 74745 : if (min_profitable_estimate > min_profitable_iters
1903 16085 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1904 15570 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1905 287 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1906 75032 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1907 : {
1908 11 : if (dump_enabled_p ())
1909 6 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1910 : " choice between the scalar and vector loops\n");
1911 11 : min_profitable_estimate = min_profitable_iters;
1912 : }
1913 :
1914 : /* If the vector loop needs multiple iterations to be beneficial then
1915 : things are probably too close to call, and the conservative thing
1916 : would be to stick with the scalar code. */
1917 74745 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1918 74745 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1919 : {
1920 8533 : if (dump_enabled_p ())
1921 177 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922 : "one iteration of the vector loop would be"
1923 : " more expensive than the equivalent number of"
1924 : " iterations of the scalar loop\n");
1925 8533 : return 0;
1926 : }
1927 :
1928 66212 : HOST_WIDE_INT estimated_niter;
1929 :
1930 : /* If we are vectorizing an epilogue then we know the maximum number of
1931 : scalar iterations it will cover is at least one lower than the
1932 : vectorization factor of the main loop. */
1933 66212 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1934 11021 : estimated_niter
1935 11021 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1936 : else
1937 : {
1938 55191 : estimated_niter = estimated_stmt_executions_int (loop);
1939 55191 : if (estimated_niter == -1)
1940 20991 : estimated_niter = likely_max_stmt_executions_int (loop);
1941 : }
1942 32012 : if (estimated_niter != -1
1943 64385 : && ((unsigned HOST_WIDE_INT) estimated_niter
1944 64385 : < MAX (th, (unsigned) min_profitable_estimate)))
1945 : {
1946 4465 : if (dump_enabled_p ())
1947 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1948 : "not vectorized: estimated iteration count too "
1949 : "small.\n");
1950 4465 : if (dump_enabled_p ())
1951 28 : dump_printf_loc (MSG_NOTE, vect_location,
1952 : "not vectorized: estimated iteration count smaller "
1953 : "than specified loop bound parameter or minimum "
1954 : "profitable iterations (whichever is more "
1955 : "conservative).\n");
1956 4465 : return -1;
1957 : }
1958 :
1959 : /* As we cannot use a runtime check to gate profitability for uncounted
1960 : loops require either an estimate or if none, at least a profitable
1961 : vectorization within the first vector iteration (that condition
1962 : will practically never be true due to the required epilog and
1963 : likely alignment prologue). */
1964 61747 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)
1965 155 : && estimated_niter == -1
1966 61875 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1967 : {
1968 112 : if (dump_enabled_p ())
1969 2 : dump_printf_loc (MSG_NOTE, vect_location,
1970 : "not vectorized: no loop iteration estimate on the "
1971 : "uncounted loop and not trivially profitable.\n");
1972 112 : return -1;
1973 : }
1974 :
1975 : return 1;
1976 : }
1977 :
1978 : static opt_result
1979 278642 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1980 : vec<data_reference_p> *datarefs)
1981 : {
1982 831253 : for (unsigned i = 0; i < loop->num_nodes; i++)
1983 1232980 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1984 5209805 : !gsi_end_p (gsi); gsi_next (&gsi))
1985 : {
1986 4657194 : gimple *stmt = gsi_stmt (gsi);
1987 4657194 : if (is_gimple_debug (stmt))
1988 2143048 : continue;
1989 2514276 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1990 : NULL, 0);
1991 2514276 : if (!res)
1992 : {
1993 64009 : if (is_gimple_call (stmt) && loop->safelen)
1994 : {
1995 404 : tree fndecl = gimple_call_fndecl (stmt), op;
1996 404 : if (fndecl == NULL_TREE
1997 404 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
1998 : {
1999 0 : fndecl = gimple_call_arg (stmt, 0);
2000 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2001 0 : fndecl = TREE_OPERAND (fndecl, 0);
2002 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2003 : }
2004 404 : if (fndecl != NULL_TREE)
2005 : {
2006 367 : cgraph_node *node = cgraph_node::get (fndecl);
2007 367 : if (node != NULL && node->simd_clones != NULL)
2008 : {
2009 131 : unsigned int j, n = gimple_call_num_args (stmt);
2010 545 : for (j = 0; j < n; j++)
2011 : {
2012 284 : op = gimple_call_arg (stmt, j);
2013 284 : if (DECL_P (op)
2014 284 : || (REFERENCE_CLASS_P (op)
2015 0 : && get_base_address (op)))
2016 : break;
2017 : }
2018 131 : op = gimple_call_lhs (stmt);
2019 : /* Ignore #pragma omp declare simd functions
2020 : if they don't have data references in the
2021 : call stmt itself. */
2022 261 : if (j == n
2023 131 : && !(op
2024 120 : && (DECL_P (op)
2025 120 : || (REFERENCE_CLASS_P (op)
2026 0 : && get_base_address (op)))))
2027 130 : continue;
2028 : }
2029 : }
2030 : }
2031 63879 : return res;
2032 : }
2033 : /* If dependence analysis will give up due to the limit on the
2034 : number of datarefs stop here and fail fatally. */
2035 4294841 : if (datarefs->length ()
2036 1844574 : > (unsigned)param_loop_max_datarefs_for_datadeps)
2037 0 : return opt_result::failure_at (stmt, "exceeded param "
2038 : "loop-max-datarefs-for-datadeps\n");
2039 : }
2040 214763 : return opt_result::success ();
2041 : }
2042 :
2043 : /* Determine if operating on full vectors for LOOP_VINFO might leave
2044 : some scalar iterations still to do. If so, decide how we should
2045 : handle those scalar iterations. The possibilities are:
2046 :
2047 : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2048 : In this case:
2049 :
2050 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2051 : LOOP_VINFO_PEELING_FOR_NITER == false
2052 :
2053 : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2054 : to handle the remaining scalar iterations. In this case:
2055 :
2056 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2057 : LOOP_VINFO_PEELING_FOR_NITER == true
2058 :
2059 : The MASKED_P argument specifies to what extent
2060 : param_vect_partial_vector_usage is to be honored. For MASKED_P == 0
2061 : no partial vectors are to be used, for MASKED_P == -1 it's
2062 : param_vect_partial_vector_usage that gets to decide whether we may
2063 : consider partial vector usage. For MASKED_P == 1 partial vectors
2064 : may be used if possible.
2065 :
2066 : */
2067 :
2068 : static opt_result
2069 112665 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2070 : int masked_p)
2071 : {
2072 : /* Determine whether there would be any scalar iterations left over. */
2073 112665 : bool need_peeling_or_partial_vectors_p
2074 112665 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2075 :
2076 : /* Decide whether to vectorize the loop with partial vectors. */
2077 112665 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2078 112665 : if (masked_p == 0
2079 112665 : || (masked_p == -1 && param_vect_partial_vector_usage == 0))
2080 : /* If requested explicitly do not use partial vectors. */
2081 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2082 121 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2083 42 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2084 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2085 121 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2086 42 : && need_peeling_or_partial_vectors_p)
2087 : {
2088 : /* For partial-vector-usage=1, try to push the handling of partial
2089 : vectors to the epilogue, with the main loop continuing to operate
2090 : on full vectors.
2091 :
2092 : If we are unrolling we also do not want to use partial vectors. This
2093 : is to avoid the overhead of generating multiple masks and also to
2094 : avoid having to execute entire iterations of FALSE masked instructions
2095 : when dealing with one or less full iterations.
2096 :
2097 : ??? We could then end up failing to use partial vectors if we
2098 : decide to peel iterations into a prologue, and if the main loop
2099 : then ends up processing fewer than VF iterations. */
2100 34 : if ((param_vect_partial_vector_usage == 1
2101 8 : || loop_vinfo->suggested_unroll_factor > 1)
2102 26 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2103 52 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2104 : ;
2105 : else
2106 26 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2107 : }
2108 :
2109 112665 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2110 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2111 0 : return opt_result::failure_at (vect_location,
2112 : "not vectorized: loop needs but cannot "
2113 : "use partial vectors\n");
2114 :
2115 112665 : if (dump_enabled_p ())
2116 12003 : dump_printf_loc (MSG_NOTE, vect_location,
2117 : "operating on %s vectors%s.\n",
2118 12003 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2119 : ? "partial" : "full",
2120 12003 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2121 : ? " for epilogue loop" : "");
2122 :
2123 112665 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2124 225330 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2125 112665 : && need_peeling_or_partial_vectors_p);
2126 :
2127 112665 : return opt_result::success ();
2128 : }
2129 :
2130 : /* Function vect_analyze_loop_2.
2131 :
2132 : Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2133 : analyses will record information in some members of LOOP_VINFO. FATAL
2134 : indicates if some analysis meets fatal error. If one non-NULL pointer
2135 : SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2136 : worked out suggested unroll factor, while one NULL pointer shows it's
2137 : going to apply the suggested unroll factor.
2138 : SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF is to hold whether single-lane
2139 : slp was forced when the suggested unroll factor was worked out. */
2140 : static opt_result
2141 498835 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, int masked_p, bool &fatal,
2142 : unsigned *suggested_unroll_factor,
2143 : bool& single_lane_slp_done_for_suggested_uf)
2144 : {
2145 498835 : opt_result ok = opt_result::success ();
2146 498835 : int res;
2147 498835 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2148 498835 : loop_vec_info orig_loop_vinfo = NULL;
2149 :
2150 : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2151 : loop_vec_info of the first vectorized loop. */
2152 498835 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2153 18046 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2154 : else
2155 : orig_loop_vinfo = loop_vinfo;
2156 18046 : gcc_assert (orig_loop_vinfo);
2157 :
2158 : /* We can't mask on niters for uncounted loops due to unkown upper bound. */
2159 498835 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
2160 87611 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2161 :
2162 : /* The first group of checks is independent of the vector size. */
2163 498835 : fatal = true;
2164 :
2165 498835 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2166 498835 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2167 5 : return opt_result::failure_at (vect_location,
2168 : "not vectorized: simd if(0)\n");
2169 :
2170 : /* Find all data references in the loop (which correspond to vdefs/vuses)
2171 : and analyze their evolution in the loop. */
2172 :
2173 498830 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2174 :
2175 : /* Gather the data references and count stmts in the loop. */
2176 498830 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2177 : {
2178 278642 : opt_result res
2179 278642 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2180 : &LOOP_VINFO_DATAREFS (loop_vinfo));
2181 278642 : if (!res)
2182 : {
2183 63879 : if (dump_enabled_p ())
2184 1639 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2185 : "not vectorized: loop contains function "
2186 : "calls or data references that cannot "
2187 : "be analyzed\n");
2188 63879 : return res;
2189 : }
2190 214763 : loop_vinfo->shared->save_datarefs ();
2191 : }
2192 : else
2193 220188 : loop_vinfo->shared->check_datarefs ();
2194 :
2195 : /* Analyze the data references and also adjust the minimal
2196 : vectorization factor according to the loads and stores. */
2197 :
2198 434951 : ok = vect_analyze_data_refs (loop_vinfo, &fatal);
2199 434951 : if (!ok)
2200 : {
2201 58539 : if (dump_enabled_p ())
2202 1033 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2203 : "bad data references.\n");
2204 58539 : return ok;
2205 : }
2206 :
2207 : /* Check if we are applying unroll factor now. */
2208 376412 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2209 376412 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2210 :
2211 : /* When single-lane SLP was forced and we are applying suggested unroll
2212 : factor, keep that decision here. */
2213 752824 : bool force_single_lane = (applying_suggested_uf
2214 376412 : && single_lane_slp_done_for_suggested_uf);
2215 :
2216 : /* Classify all cross-iteration scalar data-flow cycles.
2217 : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2218 376412 : vect_analyze_scalar_cycles (loop_vinfo);
2219 :
2220 376412 : vect_pattern_recog (loop_vinfo);
2221 :
2222 : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2223 : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2224 :
2225 376412 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2226 376412 : if (!ok)
2227 : {
2228 7921 : if (dump_enabled_p ())
2229 291 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 : "bad data access.\n");
2231 7921 : return ok;
2232 : }
2233 :
2234 : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2235 :
2236 368491 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2237 368491 : if (!ok)
2238 : {
2239 45844 : if (dump_enabled_p ())
2240 399 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2241 : "unexpected pattern.\n");
2242 45844 : return ok;
2243 : }
2244 :
2245 : /* While the rest of the analysis below depends on it in some way. */
2246 322647 : fatal = false;
2247 :
2248 : /* Analyze data dependences between the data-refs in the loop
2249 : and adjust the maximum vectorization factor according to
2250 : the dependences.
2251 : FORNOW: fail at the first data dependence that we encounter. */
2252 :
2253 322647 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2254 322647 : if (!ok)
2255 : {
2256 25178 : if (dump_enabled_p ())
2257 532 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2258 : "bad data dependence.\n");
2259 25178 : return ok;
2260 : }
2261 297469 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2262 :
2263 : /* Compute the scalar iteration cost. */
2264 297469 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2265 :
2266 297469 : bool saved_can_use_partial_vectors_p
2267 : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2268 :
2269 : /* This is the point where we can re-start analysis with single-lane
2270 : SLP forced. */
2271 426587 : start_over:
2272 :
2273 : /* Check the SLP opportunities in the loop, analyze and build
2274 : SLP trees. */
2275 853174 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2276 : force_single_lane);
2277 426587 : if (!ok)
2278 21413 : return ok;
2279 :
2280 : /* If there are any SLP instances mark them as pure_slp and compute
2281 : the overall vectorization factor. */
2282 405174 : if (!vect_make_slp_decision (loop_vinfo))
2283 46490 : return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
2284 :
2285 358684 : if (dump_enabled_p ())
2286 18394 : dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
2287 :
2288 : /* Dump the vectorization factor from the SLP decision. */
2289 358684 : if (dump_enabled_p ())
2290 : {
2291 18394 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
2292 18394 : dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2293 18394 : dump_printf (MSG_NOTE, "\n");
2294 : }
2295 :
2296 : /* We don't expect to have to roll back to anything other than an empty
2297 : set of rgroups. */
2298 358684 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2299 :
2300 : /* Apply the suggested unrolling factor, this was determined by the backend
2301 : during finish_cost the first time we ran the analyzis for this
2302 : vector mode. */
2303 358684 : if (applying_suggested_uf)
2304 247 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2305 :
2306 : /* Now the vectorization factor is final. */
2307 358684 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2308 358684 : gcc_assert (known_ne (vectorization_factor, 0U));
2309 :
2310 : /* Optimize the SLP graph with the vectorization factor fixed. */
2311 358684 : vect_optimize_slp (loop_vinfo);
2312 :
2313 : /* Gather the loads reachable from the SLP graph entries. */
2314 358684 : vect_gather_slp_loads (loop_vinfo);
2315 :
2316 358684 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2317 : {
2318 13796 : dump_printf_loc (MSG_NOTE, vect_location,
2319 : "vectorization_factor = ");
2320 13796 : dump_dec (MSG_NOTE, vectorization_factor);
2321 13796 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2322 13796 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2323 : }
2324 :
2325 358684 : if (max_vf != MAX_VECTORIZATION_FACTOR
2326 358684 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2327 41 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2328 :
2329 358643 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2330 :
2331 : /* Analyze the alignment of the data-refs in the loop. */
2332 358643 : vect_analyze_data_refs_alignment (loop_vinfo);
2333 :
2334 : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2335 : It is important to call pruning after vect_analyze_data_ref_accesses,
2336 : since we use grouping information gathered by interleaving analysis. */
2337 358643 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2338 358643 : if (!ok)
2339 17187 : return ok;
2340 :
2341 : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2342 : vectorization, since we do not want to add extra peeling or
2343 : add versioning for alignment. */
2344 341456 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2345 : /* This pass will decide on using loop versioning and/or loop peeling in
2346 : order to enhance the alignment of data references in the loop. */
2347 326339 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2348 341456 : if (!ok)
2349 0 : return ok;
2350 :
2351 : /* Analyze operations in the SLP instances. We can't simply
2352 : remove unsupported SLP instances as this makes the above
2353 : SLP kind detection invalid and might also affect the VF. */
2354 341456 : if (! vect_slp_analyze_operations (loop_vinfo))
2355 : {
2356 228791 : ok = opt_result::failure_at (vect_location,
2357 : "unsupported SLP instances\n");
2358 228791 : goto again;
2359 : }
2360 :
2361 : /* For now, we don't expect to mix both masking and length approaches for one
2362 : loop, disable it if both are recorded. */
2363 112665 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2364 16761 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2365 129420 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2366 : {
2367 0 : if (dump_enabled_p ())
2368 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 : "can't vectorize a loop with partial vectors"
2370 : " because we don't expect to mix different"
2371 : " approaches with partial vectors for the"
2372 : " same loop.\n");
2373 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2374 : }
2375 :
2376 : /* If we still have the option of using partial vectors,
2377 : check whether we can generate the necessary loop controls. */
2378 112665 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2379 : {
2380 16761 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2381 : {
2382 16755 : if (!vect_verify_full_masking (loop_vinfo)
2383 16755 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2384 3655 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2385 : }
2386 : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2387 6 : if (!vect_verify_loop_lens (loop_vinfo))
2388 6 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2389 : }
2390 :
2391 : /* Decide whether this loop_vinfo should use partial vectors or peeling,
2392 : assuming that the loop will be used as a main loop. We will redo
2393 : this analysis later if we instead decide to use the loop as an
2394 : epilogue loop. */
2395 112665 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, masked_p);
2396 112665 : if (!ok)
2397 0 : return ok;
2398 :
2399 : /* If we're vectorizing a loop that uses length "controls" and
2400 : can iterate more than once, we apply decrementing IV approach
2401 : in loop control. */
2402 112665 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2403 26 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2404 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2405 112665 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2406 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2407 : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2408 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2409 :
2410 : /* If a loop uses length controls and has a decrementing loop control IV,
2411 : we will normally pass that IV through a MIN_EXPR to calcaluate the
2412 : basis for the length controls. E.g. in a loop that processes one
2413 : element per scalar iteration, the number of elements would be
2414 : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2415 :
2416 : This MIN_EXPR approach allows us to use pointer IVs with an invariant
2417 : step, since only the final iteration of the vector loop can have
2418 : inactive lanes.
2419 :
2420 : However, some targets have a dedicated instruction for calculating the
2421 : preferred length, given the total number of elements that still need to
2422 : be processed. This is encapsulated in the SELECT_VL internal function.
2423 :
2424 : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2425 : to determine the basis for the length controls. However, unlike the
2426 : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2427 : lanes inactive in any iteration of the vector loop, not just the last
2428 : iteration. This SELECT_VL approach therefore requires us to use pointer
2429 : IVs with variable steps.
2430 :
2431 : Once we've decided how many elements should be processed by one
2432 : iteration of the vector loop, we need to populate the rgroup controls.
2433 : If a loop has multiple rgroups, we need to make sure that those rgroups
2434 : "line up" (that is, they must be consistent about which elements are
2435 : active and which aren't). This is done by vect_adjust_loop_lens_control.
2436 :
2437 : In principle, it would be possible to use vect_adjust_loop_lens_control
2438 : on either the result of a MIN_EXPR or the result of a SELECT_VL.
2439 : However:
2440 :
2441 : (1) In practice, it only makes sense to use SELECT_VL when a vector
2442 : operation will be controlled directly by the result. It is not
2443 : worth using SELECT_VL if it would only be the input to other
2444 : calculations.
2445 :
2446 : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2447 : pointer IV will need N updates by a variable amount (N-1 updates
2448 : within the iteration and 1 update to move to the next iteration).
2449 :
2450 : Because of this, we prefer to use the MIN_EXPR approach whenever there
2451 : is more than one length control.
2452 :
2453 : In addition, SELECT_VL always operates to a granularity of 1 unit.
2454 : If we wanted to use it to control an SLP operation on N consecutive
2455 : elements, we would need to make the SELECT_VL inputs measure scalar
2456 : iterations (rather than elements) and then multiply the SELECT_VL
2457 : result by N. But using SELECT_VL this way is inefficient because
2458 : of (1) above.
2459 :
2460 : 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2461 : satisfied:
2462 :
2463 : (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2464 : (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2465 :
2466 : Since SELECT_VL (variable step) will make SCEV analysis failed and then
2467 : we will fail to gain benefits of following unroll optimizations. We prefer
2468 : using the MIN_EXPR approach in this situation. */
2469 112665 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2470 : {
2471 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2472 0 : if (LOOP_VINFO_LENS (loop_vinfo).length () == 1
2473 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
2474 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2475 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2476 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2477 :
2478 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2479 0 : for (auto rgc : LOOP_VINFO_LENS (loop_vinfo))
2480 0 : if (rgc.type
2481 0 : && !direct_internal_fn_supported_p (IFN_SELECT_VL,
2482 : rgc.type, iv_type,
2483 : OPTIMIZE_FOR_SPEED))
2484 : {
2485 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2486 0 : break;
2487 : }
2488 :
2489 : /* If any of the SLP instances cover more than a single lane
2490 : we cannot use .SELECT_VL at the moment, even if the number
2491 : of lanes is uniform throughout the SLP graph. */
2492 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2493 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
2494 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
2495 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
2496 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
2497 : {
2498 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2499 0 : break;
2500 : }
2501 : }
2502 :
2503 : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2504 : to be able to handle fewer than VF scalars, or needs to have a lower VF
2505 : than the main loop. */
2506 112665 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2507 12654 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2508 : {
2509 12644 : poly_uint64 unscaled_vf
2510 12644 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2511 : orig_loop_vinfo->suggested_unroll_factor);
2512 12644 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
2513 285 : return opt_result::failure_at (vect_location,
2514 : "Vectorization factor too high for"
2515 : " epilogue loop.\n");
2516 : }
2517 :
2518 : /* If the epilogue needs peeling for gaps but the main loop doesn't give
2519 : up on the epilogue. */
2520 112380 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2521 12369 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2522 67 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
2523 : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2524 4 : return opt_result::failure_at (vect_location,
2525 : "Epilogue loop requires peeling for gaps "
2526 : "but main loop does not.\n");
2527 :
2528 : /* If an epilogue loop is required make sure we can create one. */
2529 112376 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2530 111124 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2531 32635 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2532 : {
2533 80681 : if (dump_enabled_p ())
2534 5281 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2535 80681 : if (!vect_can_advance_ivs_p (loop_vinfo)
2536 160850 : || !slpeel_can_duplicate_loop_p (loop,
2537 : LOOP_VINFO_MAIN_EXIT (loop_vinfo),
2538 80169 : LOOP_VINFO_MAIN_EXIT (loop_vinfo)))
2539 : {
2540 512 : ok = opt_result::failure_at (vect_location,
2541 : "not vectorized: can't create required "
2542 : "epilog loop\n");
2543 512 : goto again;
2544 : }
2545 : }
2546 :
2547 : /* Check the costings of the loop make vectorizing worthwhile. */
2548 111864 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2549 111864 : if (res < 0)
2550 : {
2551 28726 : ok = opt_result::failure_at (vect_location,
2552 : "Loop costings may not be worthwhile.\n");
2553 28726 : goto again;
2554 : }
2555 83138 : if (!res)
2556 21503 : return opt_result::failure_at (vect_location,
2557 : "Loop costings not worthwhile.\n");
2558 :
2559 : /* During peeling, we need to check if number of loop iterations is
2560 : enough for both peeled prolog loop and vector loop. This check
2561 : can be merged along with threshold check of loop versioning, so
2562 : increase threshold for this case if necessary.
2563 :
2564 : If we are analyzing an epilogue we still want to check what its
2565 : versioning threshold would be. If we decide to vectorize the epilogues we
2566 : will want to use the lowest versioning threshold of all epilogues and main
2567 : loop. This will enable us to enter a vectorized epilogue even when
2568 : versioning the loop. We can't simply check whether the epilogue requires
2569 : versioning though since we may have skipped some versioning checks when
2570 : analyzing the epilogue. For instance, checks for alias versioning will be
2571 : skipped when dealing with epilogues as we assume we already checked them
2572 : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2573 61635 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2574 : {
2575 5814 : poly_uint64 niters_th = 0;
2576 5814 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2577 :
2578 5814 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2579 : {
2580 : /* Niters for peeled prolog loop. */
2581 5814 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2582 : {
2583 118 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2584 118 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2585 118 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2586 : }
2587 : else
2588 5696 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2589 : }
2590 :
2591 : /* Niters for at least one iteration of vectorized loop. */
2592 5814 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2593 5810 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2594 : /* One additional iteration because of peeling for gap. */
2595 5814 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2596 60 : niters_th += 1;
2597 :
2598 : /* Use the same condition as vect_transform_loop to decide when to use
2599 : the cost to determine a versioning threshold. */
2600 5814 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2601 5814 : && ordered_p (th, niters_th))
2602 3911 : niters_th = ordered_max (poly_uint64 (th), niters_th);
2603 :
2604 5814 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2605 : }
2606 :
2607 61635 : gcc_assert (known_eq (vectorization_factor,
2608 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2609 :
2610 61635 : single_lane_slp_done_for_suggested_uf = force_single_lane;
2611 :
2612 : /* Ok to vectorize! */
2613 61635 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2614 61635 : return opt_result::success ();
2615 :
2616 258029 : again:
2617 : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2618 258029 : gcc_assert (!ok);
2619 :
2620 : /* Try again with single-lane SLP. */
2621 258029 : if (force_single_lane)
2622 127971 : return ok;
2623 :
2624 : /* If we are applying suggested unroll factor, we don't need to
2625 : re-try any more as we want to keep the SLP mode fixed. */
2626 130058 : if (applying_suggested_uf)
2627 6 : return ok;
2628 :
2629 : /* Likewise if the grouped loads or stores in the SLP cannot be handled
2630 : via interleaving or lane instructions. */
2631 : slp_instance instance;
2632 : slp_tree node;
2633 : unsigned i, j;
2634 353597 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2635 : {
2636 224479 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
2637 0 : continue;
2638 :
2639 224479 : stmt_vec_info vinfo;
2640 224479 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2641 224479 : if (!vinfo || !STMT_VINFO_GROUPED_ACCESS (vinfo))
2642 221967 : continue;
2643 2512 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2644 2512 : unsigned int size = DR_GROUP_SIZE (vinfo);
2645 2512 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
2646 2512 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
2647 4344 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2648 5019 : && ! vect_grouped_store_supported (vectype, size))
2649 675 : return opt_result::failure_at (vinfo->stmt,
2650 : "unsupported grouped store\n");
2651 226659 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2652 : {
2653 1929 : vinfo = SLP_TREE_REPRESENTATIVE (node);
2654 1929 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
2655 : {
2656 1681 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2657 1681 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2658 1681 : size = DR_GROUP_SIZE (vinfo);
2659 1681 : vectype = SLP_TREE_VECTYPE (node);
2660 1681 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
2661 1681 : && ! vect_grouped_load_supported (vectype, single_element_p,
2662 : size))
2663 259 : return opt_result::failure_at (vinfo->stmt,
2664 : "unsupported grouped load\n");
2665 : }
2666 : }
2667 : }
2668 :
2669 : /* Roll back state appropriately. Force single-lane SLP this time. */
2670 129118 : force_single_lane = true;
2671 129118 : if (dump_enabled_p ())
2672 3285 : dump_printf_loc (MSG_NOTE, vect_location,
2673 : "re-trying with single-lane SLP\n");
2674 :
2675 : /* Reset the vectorization factor. */
2676 129118 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
2677 : /* Free the SLP instances. */
2678 352656 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2679 223538 : vect_free_slp_instance (instance);
2680 129118 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2681 : /* Reset SLP type to loop_vect on all stmts. */
2682 494087 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2683 : {
2684 364969 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2685 364969 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
2686 647440 : !gsi_end_p (si); gsi_next (&si))
2687 : {
2688 282471 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2689 282471 : STMT_SLP_TYPE (stmt_info) = not_vect;
2690 282471 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2691 282471 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2692 : {
2693 : /* vectorizable_reduction adjusts reduction stmt def-types,
2694 : restore them to that of the PHI. */
2695 20527 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2696 20527 : = STMT_VINFO_DEF_TYPE (stmt_info);
2697 20527 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2698 : (STMT_VINFO_REDUC_DEF (stmt_info)))
2699 20527 : = STMT_VINFO_DEF_TYPE (stmt_info);
2700 : }
2701 : }
2702 729938 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
2703 2234391 : !gsi_end_p (si); gsi_next (&si))
2704 : {
2705 1869422 : if (is_gimple_debug (gsi_stmt (si)))
2706 717347 : continue;
2707 1152075 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2708 1152075 : STMT_SLP_TYPE (stmt_info) = not_vect;
2709 1152075 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2710 : {
2711 218932 : stmt_vec_info pattern_stmt_info
2712 : = STMT_VINFO_RELATED_STMT (stmt_info);
2713 218932 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2714 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2715 :
2716 218932 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2717 218932 : STMT_SLP_TYPE (pattern_stmt_info) = not_vect;
2718 218932 : for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2719 444756 : !gsi_end_p (pi); gsi_next (&pi))
2720 225824 : STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2721 225824 : = not_vect;
2722 : }
2723 : }
2724 : }
2725 : /* Free optimized alias test DDRS. */
2726 129118 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2727 129118 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2728 129118 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2729 : /* Reset target cost data. */
2730 129118 : delete loop_vinfo->vector_costs;
2731 129118 : loop_vinfo->vector_costs = nullptr;
2732 : /* Reset accumulated rgroup information. */
2733 129118 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
2734 129118 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
2735 129118 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2736 : /* Reset assorted flags. */
2737 129118 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2738 129118 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2739 129118 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2740 129118 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2741 129118 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2742 129118 : = saved_can_use_partial_vectors_p;
2743 129118 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2744 129118 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2745 129118 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2746 129118 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = false;
2747 :
2748 129118 : if (loop_vinfo->scan_map)
2749 122 : loop_vinfo->scan_map->empty ();
2750 :
2751 129118 : goto start_over;
2752 : }
2753 :
2754 : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2755 : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2756 : OLD_LOOP_VINFO is better unless something specifically indicates
2757 : otherwise.
2758 :
2759 : Note that this deliberately isn't a partial order. */
2760 :
2761 : static bool
2762 5 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2763 : loop_vec_info old_loop_vinfo)
2764 : {
2765 5 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2766 5 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2767 :
2768 5 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2769 5 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2770 :
2771 : /* Always prefer a VF of loop->simdlen over any other VF. */
2772 5 : if (loop->simdlen)
2773 : {
2774 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2775 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2776 0 : if (new_simdlen_p != old_simdlen_p)
2777 : return new_simdlen_p;
2778 : }
2779 :
2780 5 : const auto *old_costs = old_loop_vinfo->vector_costs;
2781 5 : const auto *new_costs = new_loop_vinfo->vector_costs;
2782 5 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2783 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2784 :
2785 5 : return new_costs->better_main_loop_than_p (old_costs);
2786 : }
2787 :
2788 : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2789 : true if we should. */
2790 :
2791 : static bool
2792 5 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2793 : loop_vec_info old_loop_vinfo)
2794 : {
2795 5 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2796 : return false;
2797 :
2798 1 : if (dump_enabled_p ())
2799 1 : dump_printf_loc (MSG_NOTE, vect_location,
2800 : "***** Preferring vector mode %s to vector mode %s\n",
2801 1 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
2802 1 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
2803 : return true;
2804 : }
2805 :
2806 : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
2807 : not NULL. When MASKED_P is not -1 override the default
2808 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it.
2809 : Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance MODE_I to the next
2810 : mode useful to analyze.
2811 : Return the loop_vinfo on success and wrapped null on failure. */
2812 :
2813 : static opt_loop_vec_info
2814 498588 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2815 : const vect_loop_form_info *loop_form_info,
2816 : loop_vec_info orig_loop_vinfo,
2817 : const vector_modes &vector_modes, unsigned &mode_i,
2818 : int masked_p,
2819 : machine_mode &autodetected_vector_mode,
2820 : bool &fatal)
2821 : {
2822 498588 : loop_vec_info loop_vinfo
2823 498588 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
2824 :
2825 498588 : machine_mode vector_mode = vector_modes[mode_i];
2826 498588 : loop_vinfo->vector_mode = vector_mode;
2827 498588 : unsigned int suggested_unroll_factor = 1;
2828 498588 : bool single_lane_slp_done_for_suggested_uf = false;
2829 :
2830 : /* Run the main analysis. */
2831 498588 : opt_result res = vect_analyze_loop_2 (loop_vinfo, masked_p, fatal,
2832 : &suggested_unroll_factor,
2833 : single_lane_slp_done_for_suggested_uf);
2834 498588 : if (dump_enabled_p ())
2835 20353 : dump_printf_loc (MSG_NOTE, vect_location,
2836 : "***** Analysis %s with vector mode %s\n",
2837 20353 : res ? "succeeded" : "failed",
2838 20353 : GET_MODE_NAME (loop_vinfo->vector_mode));
2839 :
2840 498588 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
2841 498588 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2842 : /* Check to see if the user wants to unroll or if the target wants to. */
2843 552966 : && (suggested_unroll_factor > 1 || user_unroll > 1))
2844 : {
2845 261 : if (suggested_unroll_factor == 1)
2846 : {
2847 44 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
2848 44 : suggested_unroll_factor = user_unroll / assumed_vf;
2849 44 : if (suggested_unroll_factor > 1)
2850 : {
2851 30 : if (dump_enabled_p ())
2852 20 : dump_printf_loc (MSG_NOTE, vect_location,
2853 : "setting unroll factor to %d based on user requested "
2854 : "unroll factor %d and suggested vectorization "
2855 : "factor: %d\n",
2856 : suggested_unroll_factor, user_unroll, assumed_vf);
2857 : }
2858 : }
2859 :
2860 261 : if (suggested_unroll_factor > 1)
2861 : {
2862 247 : if (dump_enabled_p ())
2863 44 : dump_printf_loc (MSG_NOTE, vect_location,
2864 : "***** Re-trying analysis for unrolling"
2865 : " with unroll factor %d and %s slp.\n",
2866 : suggested_unroll_factor,
2867 : single_lane_slp_done_for_suggested_uf
2868 : ? "single-lane" : "");
2869 247 : loop_vec_info unroll_vinfo
2870 247 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
2871 247 : unroll_vinfo->vector_mode = vector_mode;
2872 247 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2873 247 : opt_result new_res
2874 247 : = vect_analyze_loop_2 (unroll_vinfo, masked_p, fatal, NULL,
2875 : single_lane_slp_done_for_suggested_uf);
2876 247 : if (new_res)
2877 : {
2878 201 : delete loop_vinfo;
2879 201 : loop_vinfo = unroll_vinfo;
2880 : }
2881 : else
2882 46 : delete unroll_vinfo;
2883 : }
2884 :
2885 : /* Record that we have honored a user unroll factor. */
2886 261 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
2887 : }
2888 :
2889 : /* Remember the autodetected vector mode. */
2890 498588 : if (vector_mode == VOIDmode)
2891 269444 : autodetected_vector_mode = loop_vinfo->vector_mode;
2892 :
2893 : /* Advance mode_i, first skipping modes that would result in the
2894 : same analysis result. */
2895 2316470 : while (mode_i + 1 < vector_modes.length ()
2896 1636317 : && vect_chooses_same_modes_p (loop_vinfo,
2897 727376 : vector_modes[mode_i + 1]))
2898 : {
2899 410353 : if (dump_enabled_p ())
2900 16996 : dump_printf_loc (MSG_NOTE, vect_location,
2901 : "***** The result for vector mode %s would"
2902 : " be the same\n",
2903 16996 : GET_MODE_NAME (vector_modes[mode_i + 1]));
2904 410353 : mode_i += 1;
2905 : }
2906 498588 : if (mode_i + 1 < vector_modes.length ()
2907 815611 : && vect_chooses_same_modes_p (autodetected_vector_mode,
2908 317023 : vector_modes[mode_i + 1]))
2909 : {
2910 349 : if (dump_enabled_p ())
2911 10 : dump_printf_loc (MSG_NOTE, vect_location,
2912 : "***** Skipping vector mode %s, which would"
2913 : " repeat the analysis for %s\n",
2914 10 : GET_MODE_NAME (vector_modes[mode_i + 1]),
2915 10 : GET_MODE_NAME (autodetected_vector_mode));
2916 349 : mode_i += 1;
2917 : }
2918 498588 : mode_i++;
2919 :
2920 498588 : if (!res)
2921 : {
2922 437154 : delete loop_vinfo;
2923 437154 : if (fatal)
2924 105323 : gcc_checking_assert (orig_loop_vinfo == NULL);
2925 437154 : return opt_loop_vec_info::propagate_failure (res);
2926 : }
2927 :
2928 61434 : return opt_loop_vec_info::success (loop_vinfo);
2929 : }
2930 :
2931 : /* Function vect_analyze_loop.
2932 :
2933 : Apply a set of analyses on LOOP, and create a loop_vec_info struct
2934 : for it. The different analyses will record information in the
2935 : loop_vec_info struct. */
2936 : opt_loop_vec_info
2937 469419 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
2938 : vec_info_shared *shared)
2939 : {
2940 469419 : DUMP_VECT_SCOPE ("analyze_loop_nest");
2941 :
2942 469419 : if (loop_outer (loop)
2943 469419 : && loop_vec_info_for_loop (loop_outer (loop))
2944 469977 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2945 558 : return opt_loop_vec_info::failure_at (vect_location,
2946 : "outer-loop already vectorized.\n");
2947 :
2948 468861 : if (!find_loop_nest (loop, &shared->loop_nest))
2949 22482 : return opt_loop_vec_info::failure_at
2950 22482 : (vect_location,
2951 : "not vectorized: loop nest containing two or more consecutive inner"
2952 : " loops cannot be vectorized\n");
2953 :
2954 : /* Analyze the loop form. */
2955 446379 : vect_loop_form_info loop_form_info;
2956 446379 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
2957 : &loop_form_info);
2958 446379 : if (!res)
2959 : {
2960 176935 : if (dump_enabled_p ())
2961 1519 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2962 : "bad loop form.\n");
2963 176935 : return opt_loop_vec_info::propagate_failure (res);
2964 : }
2965 269444 : if (!integer_onep (loop_form_info.assumptions))
2966 : {
2967 : /* We consider to vectorize this loop by versioning it under
2968 : some assumptions. In order to do this, we need to clear
2969 : existing information computed by scev and niter analyzer. */
2970 8260 : scev_reset_htab ();
2971 8260 : free_numbers_of_iterations_estimates (loop);
2972 : /* Also set flag for this loop so that following scev and niter
2973 : analysis are done under the assumptions. */
2974 8260 : loop_constraint_set (loop, LOOP_C_FINITE);
2975 : }
2976 : else
2977 : /* Clear the existing niter information to make sure the nonwrapping flag
2978 : will be calculated and set propriately. */
2979 261184 : free_numbers_of_iterations_estimates (loop);
2980 :
2981 269444 : auto_vector_modes vector_modes;
2982 : /* Autodetect first vector size we try. */
2983 269444 : vector_modes.safe_push (VOIDmode);
2984 269444 : unsigned int autovec_flags
2985 538888 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2986 269444 : loop->simdlen != 0);
2987 269444 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2988 269444 : && !unlimited_cost_model (loop));
2989 269444 : machine_mode autodetected_vector_mode = VOIDmode;
2990 269444 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2991 269444 : unsigned int mode_i = 0;
2992 269444 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2993 :
2994 : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
2995 : a mode has not been analyzed. */
2996 269444 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
2997 2706728 : for (unsigned i = 0; i < vector_modes.length (); ++i)
2998 1083920 : cached_vf_per_mode.safe_push (0);
2999 :
3000 : /* First determine the main loop vectorization mode, either the first
3001 : one that works, starting with auto-detecting the vector mode and then
3002 : following the targets order of preference, or the one with the
3003 : lowest cost if pick_lowest_cost_p. */
3004 691640 : while (1)
3005 : {
3006 480542 : bool fatal;
3007 480542 : unsigned int last_mode_i = mode_i;
3008 : /* Set cached VF to -1 prior to analysis, which indicates a mode has
3009 : failed. */
3010 480542 : cached_vf_per_mode[last_mode_i] = -1;
3011 480542 : opt_loop_vec_info loop_vinfo
3012 480542 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3013 : NULL, vector_modes, mode_i, -1,
3014 : autodetected_vector_mode, fatal);
3015 480542 : if (fatal)
3016 : break;
3017 :
3018 375219 : if (loop_vinfo)
3019 : {
3020 : /* Analyzis has been successful so update the VF value. The
3021 : VF should always be a multiple of unroll_factor and we want to
3022 : capture the original VF here. */
3023 54378 : cached_vf_per_mode[last_mode_i]
3024 54378 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3025 54378 : loop_vinfo->suggested_unroll_factor);
3026 : /* Once we hit the desired simdlen for the first time,
3027 : discard any previous attempts. */
3028 54378 : if (simdlen
3029 54378 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3030 : {
3031 47 : delete first_loop_vinfo;
3032 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3033 : simdlen = 0;
3034 : }
3035 54331 : else if (pick_lowest_cost_p
3036 10 : && first_loop_vinfo
3037 54336 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3038 : {
3039 : /* Pick loop_vinfo over first_loop_vinfo. */
3040 1 : delete first_loop_vinfo;
3041 1 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3042 : }
3043 54378 : if (first_loop_vinfo == NULL)
3044 : first_loop_vinfo = loop_vinfo;
3045 : else
3046 : {
3047 6 : delete loop_vinfo;
3048 6 : loop_vinfo = opt_loop_vec_info::success (NULL);
3049 : }
3050 :
3051 : /* Commit to first_loop_vinfo if we have no reason to try
3052 : alternatives. */
3053 54378 : if (!simdlen && !pick_lowest_cost_p)
3054 : break;
3055 : }
3056 320860 : if (mode_i == vector_modes.length ()
3057 320860 : || autodetected_vector_mode == VOIDmode)
3058 : break;
3059 :
3060 : /* Try the next biggest vector size. */
3061 211098 : if (dump_enabled_p ())
3062 3951 : dump_printf_loc (MSG_NOTE, vect_location,
3063 : "***** Re-trying analysis with vector mode %s\n",
3064 3951 : GET_MODE_NAME (vector_modes[mode_i]));
3065 211098 : }
3066 269444 : if (!first_loop_vinfo)
3067 215078 : return opt_loop_vec_info::propagate_failure (res);
3068 :
3069 54366 : if (dump_enabled_p ())
3070 9500 : dump_printf_loc (MSG_NOTE, vect_location,
3071 : "***** Choosing vector mode %s\n",
3072 9500 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3073 :
3074 : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3075 : enabled, SIMDUID is not set, it is the innermost loop and we have
3076 : either already found the loop's SIMDLEN or there was no SIMDLEN to
3077 : begin with.
3078 : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3079 54366 : bool vect_epilogues = (!simdlen
3080 54364 : && loop->inner == NULL
3081 53792 : && param_vect_epilogues_nomask
3082 52719 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3083 : /* No code motion support for multiple epilogues so for now
3084 : not supported when multiple exits. */
3085 25862 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3086 25389 : && !loop->simduid
3087 78342 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3088 54366 : if (!vect_epilogues)
3089 41464 : return first_loop_vinfo;
3090 :
3091 : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3092 :
3093 : /* For epilogues start the analysis from the first mode. The motivation
3094 : behind starting from the beginning comes from cases where the VECTOR_MODES
3095 : array may contain length-agnostic and length-specific modes. Their
3096 : ordering is not guaranteed, so we could end up picking a mode for the main
3097 : loop that is after the epilogue's optimal mode. */
3098 12902 : int masked_p = -1;
3099 12902 : if (!unlimited_cost_model (loop)
3100 12902 : && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3101 : != VOIDmode))
3102 : {
3103 4 : vector_modes[0]
3104 4 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3105 4 : cached_vf_per_mode[0] = 0;
3106 : }
3107 : else
3108 12898 : vector_modes[0] = autodetected_vector_mode;
3109 12902 : mode_i = 0;
3110 :
3111 12938 : bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
3112 12902 : || masked_p == 1);
3113 : if (supports_partial_vectors
3114 36 : && !partial_vectors_supported_p ()
3115 36 : && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo))
3116 : supports_partial_vectors = false;
3117 12902 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3118 :
3119 12902 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3120 13090 : do
3121 : {
3122 : /* Let the user override what the target suggests. */
3123 12996 : if (OPTION_SET_P (param_vect_partial_vector_usage))
3124 43 : masked_p = -1;
3125 :
3126 44506 : while (1)
3127 : {
3128 : /* If the target does not support partial vectors we can shorten the
3129 : number of modes to analyze for the epilogue as we know we can't
3130 : pick a mode that would lead to a VF at least as big as the
3131 : FIRST_VINFO_VF. */
3132 58277 : if (!supports_partial_vectors
3133 44506 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3134 : {
3135 13800 : mode_i++;
3136 27600 : if (mode_i == vector_modes.length ())
3137 : break;
3138 26431 : continue;
3139 : }
3140 : /* We would need an exhaustive search to find all modes we
3141 : skipped but that would lead to the same result as the
3142 : analysis it was skipped for and where we'd could check
3143 : cached_vf_per_mode against.
3144 : Check for the autodetected mode, which is the common
3145 : situation on x86 which does not perform cost comparison. */
3146 43366 : if (!supports_partial_vectors
3147 30696 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3148 60863 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3149 30157 : vector_modes[mode_i]))
3150 : {
3151 12660 : mode_i++;
3152 25320 : if (mode_i == vector_modes.length ())
3153 : break;
3154 12660 : continue;
3155 : }
3156 :
3157 18046 : if (dump_enabled_p ())
3158 3232 : dump_printf_loc (MSG_NOTE, vect_location,
3159 : "***** Re-trying epilogue analysis with vector "
3160 3232 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3161 :
3162 18046 : bool fatal;
3163 18046 : opt_loop_vec_info loop_vinfo
3164 18046 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3165 : orig_loop_vinfo,
3166 : vector_modes, mode_i, masked_p,
3167 : autodetected_vector_mode, fatal);
3168 18046 : if (fatal)
3169 : break;
3170 :
3171 18046 : if (loop_vinfo)
3172 : {
3173 7056 : if (pick_lowest_cost_p
3174 4 : && orig_loop_vinfo->epilogue_vinfo
3175 7056 : && vect_joust_loop_vinfos (loop_vinfo,
3176 0 : orig_loop_vinfo->epilogue_vinfo))
3177 : {
3178 0 : gcc_assert (vect_epilogues);
3179 0 : delete orig_loop_vinfo->epilogue_vinfo;
3180 0 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3181 : }
3182 7056 : if (!orig_loop_vinfo->epilogue_vinfo)
3183 7056 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3184 : else
3185 : {
3186 0 : delete loop_vinfo;
3187 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3188 : }
3189 :
3190 : /* For now only allow one epilogue loop, but allow
3191 : pick_lowest_cost_p to replace it, so commit to the
3192 : first epilogue if we have no reason to try alternatives. */
3193 7056 : if (!pick_lowest_cost_p)
3194 : break;
3195 : }
3196 :
3197 : /* Revert back to the default from the suggested prefered
3198 : epilogue vectorization mode. */
3199 10994 : masked_p = -1;
3200 21988 : if (mode_i == vector_modes.length ())
3201 : break;
3202 : }
3203 :
3204 12996 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3205 12996 : if (!orig_loop_vinfo)
3206 : break;
3207 :
3208 : /* When we selected a first vectorized epilogue, see if the target
3209 : suggests to have another one. */
3210 7056 : masked_p = -1;
3211 7056 : if (!unlimited_cost_model (loop)
3212 4120 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3213 11169 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3214 : != VOIDmode))
3215 : {
3216 188 : vector_modes[0]
3217 94 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3218 94 : cached_vf_per_mode[0] = 0;
3219 94 : mode_i = 0;
3220 : }
3221 : else
3222 : break;
3223 94 : }
3224 : while (1);
3225 :
3226 12902 : if (first_loop_vinfo->epilogue_vinfo)
3227 : {
3228 6967 : poly_uint64 lowest_th
3229 6967 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3230 6967 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3231 7056 : do
3232 : {
3233 7056 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3234 7056 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3235 : || maybe_ne (lowest_th, 0U));
3236 : /* Keep track of the known smallest versioning threshold. */
3237 7056 : if (ordered_p (lowest_th, th))
3238 7056 : lowest_th = ordered_min (lowest_th, th);
3239 7056 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3240 : }
3241 7056 : while (epilog_vinfo);
3242 6967 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3243 6967 : if (dump_enabled_p ())
3244 1441 : dump_printf_loc (MSG_NOTE, vect_location,
3245 : "***** Choosing epilogue vector mode %s\n",
3246 1441 : GET_MODE_NAME
3247 : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3248 : }
3249 :
3250 12902 : return first_loop_vinfo;
3251 715823 : }
3252 :
3253 : /* Return true if there is an in-order reduction function for CODE, storing
3254 : it in *REDUC_FN if so. */
3255 :
3256 : static bool
3257 4714 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3258 : {
3259 : /* We support MINUS_EXPR by negating the operand. This also preserves an
3260 : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3261 : (-0.0) = -0.0. */
3262 4714 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3263 : {
3264 4038 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3265 0 : return true;
3266 : }
3267 : return false;
3268 : }
3269 :
3270 : /* Function reduction_fn_for_scalar_code
3271 :
3272 : Input:
3273 : CODE - tree_code of a reduction operations.
3274 :
3275 : Output:
3276 : REDUC_FN - the corresponding internal function to be used to reduce the
3277 : vector of partial results into a single scalar result, or IFN_LAST
3278 : if the operation is a supported reduction operation, but does not have
3279 : such an internal function.
3280 :
3281 : Return FALSE if CODE currently cannot be vectorized as reduction. */
3282 :
3283 : bool
3284 1983936 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3285 : {
3286 1983936 : if (code.is_tree_code ())
3287 1983878 : switch (tree_code (code))
3288 : {
3289 15079 : case MAX_EXPR:
3290 15079 : *reduc_fn = IFN_REDUC_MAX;
3291 15079 : return true;
3292 :
3293 49837 : case MIN_EXPR:
3294 49837 : *reduc_fn = IFN_REDUC_MIN;
3295 49837 : return true;
3296 :
3297 1081180 : case PLUS_EXPR:
3298 1081180 : *reduc_fn = IFN_REDUC_PLUS;
3299 1081180 : return true;
3300 :
3301 235784 : case BIT_AND_EXPR:
3302 235784 : *reduc_fn = IFN_REDUC_AND;
3303 235784 : return true;
3304 :
3305 279901 : case BIT_IOR_EXPR:
3306 279901 : *reduc_fn = IFN_REDUC_IOR;
3307 279901 : return true;
3308 :
3309 42883 : case BIT_XOR_EXPR:
3310 42883 : *reduc_fn = IFN_REDUC_XOR;
3311 42883 : return true;
3312 :
3313 279214 : case MULT_EXPR:
3314 279214 : case MINUS_EXPR:
3315 279214 : *reduc_fn = IFN_LAST;
3316 279214 : return true;
3317 :
3318 : default:
3319 : return false;
3320 : }
3321 : else
3322 58 : switch (combined_fn (code))
3323 : {
3324 34 : CASE_CFN_FMAX:
3325 34 : *reduc_fn = IFN_REDUC_FMAX;
3326 34 : return true;
3327 :
3328 24 : CASE_CFN_FMIN:
3329 24 : *reduc_fn = IFN_REDUC_FMIN;
3330 24 : return true;
3331 :
3332 : default:
3333 : return false;
3334 : }
3335 : }
3336 :
3337 : /* Set *SBOOL_FN to the corresponding function working on vector masks
3338 : for REDUC_FN. Return true if that exists, false otherwise. */
3339 :
3340 : static bool
3341 0 : sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
3342 : {
3343 0 : switch (reduc_fn)
3344 : {
3345 0 : case IFN_REDUC_AND:
3346 0 : *sbool_fn = IFN_REDUC_SBOOL_AND;
3347 0 : return true;
3348 0 : case IFN_REDUC_IOR:
3349 0 : *sbool_fn = IFN_REDUC_SBOOL_IOR;
3350 0 : return true;
3351 0 : case IFN_REDUC_XOR:
3352 0 : *sbool_fn = IFN_REDUC_SBOOL_XOR;
3353 0 : return true;
3354 : default:
3355 : return false;
3356 : }
3357 : }
3358 :
3359 : /* If there is a neutral value X such that a reduction would not be affected
3360 : by the introduction of additional X elements, return that X, otherwise
3361 : return null. CODE is the code of the reduction and SCALAR_TYPE is type
3362 : of the scalar elements. If the reduction has just a single initial value
3363 : then INITIAL_VALUE is that value, otherwise it is null.
3364 : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3365 : In that case no signed zero is returned. */
3366 :
3367 : tree
3368 52268 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3369 : tree initial_value, bool as_initial)
3370 : {
3371 52268 : if (code.is_tree_code ())
3372 52210 : switch (tree_code (code))
3373 : {
3374 7792 : case DOT_PROD_EXPR:
3375 7792 : case SAD_EXPR:
3376 7792 : case MINUS_EXPR:
3377 7792 : case BIT_IOR_EXPR:
3378 7792 : case BIT_XOR_EXPR:
3379 7792 : return build_zero_cst (scalar_type);
3380 39413 : case WIDEN_SUM_EXPR:
3381 39413 : case PLUS_EXPR:
3382 39413 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3383 92 : return build_real (scalar_type, dconstm0);
3384 : else
3385 39321 : return build_zero_cst (scalar_type);
3386 :
3387 2046 : case MULT_EXPR:
3388 2046 : return build_one_cst (scalar_type);
3389 :
3390 960 : case BIT_AND_EXPR:
3391 960 : return build_all_ones_cst (scalar_type);
3392 :
3393 : case MAX_EXPR:
3394 : case MIN_EXPR:
3395 : return initial_value;
3396 :
3397 384 : default:
3398 384 : return NULL_TREE;
3399 : }
3400 : else
3401 58 : switch (combined_fn (code))
3402 : {
3403 : CASE_CFN_FMIN:
3404 : CASE_CFN_FMAX:
3405 : return initial_value;
3406 :
3407 0 : default:
3408 0 : return NULL_TREE;
3409 : }
3410 : }
3411 :
3412 : /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3413 : STMT is printed with a message MSG. */
3414 :
3415 : static void
3416 577 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3417 : {
3418 577 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3419 577 : }
3420 :
3421 : /* Return true if we need an in-order reduction for operation CODE
3422 : on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3423 : overflow must wrap. */
3424 :
3425 : bool
3426 6433991 : needs_fold_left_reduction_p (tree type, code_helper code)
3427 : {
3428 : /* CHECKME: check for !flag_finite_math_only too? */
3429 6433991 : if (SCALAR_FLOAT_TYPE_P (type))
3430 : {
3431 547752 : if (code.is_tree_code ())
3432 547698 : switch (tree_code (code))
3433 : {
3434 : case MIN_EXPR:
3435 : case MAX_EXPR:
3436 : return false;
3437 :
3438 545999 : default:
3439 545999 : return !flag_associative_math;
3440 : }
3441 : else
3442 54 : switch (combined_fn (code))
3443 : {
3444 : CASE_CFN_FMIN:
3445 : CASE_CFN_FMAX:
3446 : return false;
3447 :
3448 2 : default:
3449 2 : return !flag_associative_math;
3450 : }
3451 : }
3452 :
3453 5886239 : if (INTEGRAL_TYPE_P (type))
3454 5885373 : return (!code.is_tree_code ()
3455 5885373 : || !operation_no_trapping_overflow (type, tree_code (code)));
3456 :
3457 866 : if (SAT_FIXED_POINT_TYPE_P (type))
3458 : return true;
3459 :
3460 : return false;
3461 : }
3462 :
3463 : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3464 : has a handled computation expression. Store the main reduction
3465 : operation in *CODE. */
3466 :
3467 : static bool
3468 76248 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3469 : tree loop_arg, code_helper *code,
3470 : vec<std::pair<ssa_op_iter, use_operand_p> > &path,
3471 : bool inner_loop_of_double_reduc)
3472 : {
3473 76248 : auto_bitmap visited;
3474 76248 : tree lookfor = PHI_RESULT (phi);
3475 76248 : ssa_op_iter curri;
3476 76248 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3477 160086 : while (USE_FROM_PTR (curr) != loop_arg)
3478 7590 : curr = op_iter_next_use (&curri);
3479 76248 : curri.i = curri.numops;
3480 700064 : do
3481 : {
3482 700064 : path.safe_push (std::make_pair (curri, curr));
3483 700064 : tree use = USE_FROM_PTR (curr);
3484 700064 : if (use == lookfor)
3485 : break;
3486 624211 : gimple *def = SSA_NAME_DEF_STMT (use);
3487 624211 : if (gimple_nop_p (def)
3488 624211 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3489 : {
3490 522358 : pop:
3491 522358 : do
3492 : {
3493 522358 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3494 522358 : curri = x.first;
3495 522358 : curr = x.second;
3496 573563 : do
3497 573563 : curr = op_iter_next_use (&curri);
3498 : /* Skip already visited or non-SSA operands (from iterating
3499 : over PHI args). */
3500 : while (curr != NULL_USE_OPERAND_P
3501 1147126 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3502 204354 : || ! bitmap_set_bit (visited,
3503 204354 : SSA_NAME_VERSION
3504 : (USE_FROM_PTR (curr)))));
3505 : }
3506 1044716 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3507 180352 : if (curr == NULL_USE_OPERAND_P)
3508 : break;
3509 : }
3510 : else
3511 : {
3512 524540 : if (gimple_code (def) == GIMPLE_PHI)
3513 56247 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3514 : else
3515 468293 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3516 : while (curr != NULL_USE_OPERAND_P
3517 636605 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3518 550929 : || ! bitmap_set_bit (visited,
3519 550929 : SSA_NAME_VERSION
3520 : (USE_FROM_PTR (curr)))))
3521 112065 : curr = op_iter_next_use (&curri);
3522 524540 : if (curr == NULL_USE_OPERAND_P)
3523 80681 : goto pop;
3524 : }
3525 : }
3526 : while (1);
3527 76248 : if (dump_file && (dump_flags & TDF_DETAILS))
3528 : {
3529 3980 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3530 3980 : unsigned i;
3531 3980 : std::pair<ssa_op_iter, use_operand_p> *x;
3532 13527 : FOR_EACH_VEC_ELT (path, i, x)
3533 9547 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3534 3980 : dump_printf (MSG_NOTE, "\n");
3535 : }
3536 :
3537 : /* Check whether the reduction path detected is valid. */
3538 76248 : bool fail = path.length () == 0;
3539 76248 : bool neg = false;
3540 76248 : int sign = -1;
3541 76248 : *code = ERROR_MARK;
3542 161688 : for (unsigned i = 1; i < path.length (); ++i)
3543 : {
3544 92043 : gimple *use_stmt = USE_STMT (path[i].second);
3545 92043 : gimple_match_op op;
3546 92043 : if (!gimple_extract_op (use_stmt, &op))
3547 : {
3548 : fail = true;
3549 6603 : break;
3550 : }
3551 91159 : unsigned int opi = op.num_ops;
3552 91159 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3553 : {
3554 : /* The following make sure we can compute the operand index
3555 : easily plus it mostly disallows chaining via COND_EXPR condition
3556 : operands. */
3557 147648 : for (opi = 0; opi < op.num_ops; ++opi)
3558 146696 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3559 : break;
3560 : }
3561 3524 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3562 : {
3563 7062 : for (opi = 0; opi < op.num_ops; ++opi)
3564 7062 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3565 : break;
3566 : }
3567 91159 : if (opi == op.num_ops)
3568 : {
3569 : fail = true;
3570 : break;
3571 : }
3572 90207 : op.code = canonicalize_code (op.code, op.type);
3573 90207 : if (op.code == MINUS_EXPR)
3574 : {
3575 3872 : op.code = PLUS_EXPR;
3576 : /* Track whether we negate the reduction value each iteration. */
3577 3872 : if (op.ops[1] == op.ops[opi])
3578 34 : neg = ! neg;
3579 : }
3580 86335 : else if (op.code == IFN_COND_SUB)
3581 : {
3582 2 : op.code = IFN_COND_ADD;
3583 : /* Track whether we negate the reduction value each iteration. */
3584 2 : if (op.ops[2] == op.ops[opi])
3585 0 : neg = ! neg;
3586 : }
3587 : /* For an FMA the reduction code is the PLUS if the addition chain
3588 : is the reduction. */
3589 86333 : else if (op.code == IFN_FMA && opi == 2)
3590 28 : op.code = PLUS_EXPR;
3591 90207 : if (CONVERT_EXPR_CODE_P (op.code)
3592 90207 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3593 : ;
3594 86733 : else if (*code == ERROR_MARK)
3595 : {
3596 74111 : *code = op.code;
3597 74111 : sign = TYPE_SIGN (op.type);
3598 : }
3599 12622 : else if (op.code != *code)
3600 : {
3601 : fail = true;
3602 : break;
3603 : }
3604 11300 : else if ((op.code == MIN_EXPR
3605 11216 : || op.code == MAX_EXPR)
3606 11315 : && sign != TYPE_SIGN (op.type))
3607 : {
3608 : fail = true;
3609 : break;
3610 : }
3611 : /* Check there's only a single stmt the op is used on. For the
3612 : not value-changing tail and the last stmt allow out-of-loop uses,
3613 : but not when this is the inner loop of a double reduction.
3614 : ??? We could relax this and handle arbitrary live stmts by
3615 : forcing a scalar epilogue for example. */
3616 88882 : imm_use_iterator imm_iter;
3617 88882 : use_operand_p use_p;
3618 88882 : gimple *op_use_stmt;
3619 88882 : unsigned cnt = 0;
3620 92376 : bool cond_fn_p = op.code.is_internal_fn ()
3621 3494 : && (conditional_internal_fn_code (internal_fn (op.code))
3622 88882 : != ERROR_MARK);
3623 :
3624 303559 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3625 : {
3626 : /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
3627 : have op1 twice (once as definition, once as else) in the same
3628 : operation. Enforce this. */
3629 125795 : if (cond_fn_p && op_use_stmt == use_stmt)
3630 : {
3631 3428 : gcall *call = as_a<gcall *> (use_stmt);
3632 3428 : unsigned else_pos
3633 3428 : = internal_fn_else_index (internal_fn (op.code));
3634 3428 : if (gimple_call_arg (call, else_pos) != op.ops[opi])
3635 : {
3636 : fail = true;
3637 : break;
3638 : }
3639 17140 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
3640 : {
3641 13712 : if (j == else_pos)
3642 3428 : continue;
3643 10284 : if (gimple_call_arg (call, j) == op.ops[opi])
3644 3428 : cnt++;
3645 : }
3646 : }
3647 122367 : else if (!is_gimple_debug (op_use_stmt)
3648 122367 : && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
3649 1806 : || flow_bb_inside_loop_p (loop,
3650 1806 : gimple_bb (op_use_stmt))))
3651 177939 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3652 88974 : cnt++;
3653 88882 : }
3654 :
3655 88882 : if (cnt != 1)
3656 : {
3657 : fail = true;
3658 : break;
3659 : }
3660 : }
3661 83257 : return ! fail && ! neg && *code != ERROR_MARK;
3662 76248 : }
3663 :
3664 : bool
3665 21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3666 : tree loop_arg, enum tree_code code)
3667 : {
3668 21 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3669 21 : code_helper code_;
3670 21 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
3671 21 : && code_ == code);
3672 21 : }
3673 :
3674 :
3675 :
3676 : /* Function vect_is_simple_reduction
3677 :
3678 : (1) Detect a cross-iteration def-use cycle that represents a simple
3679 : reduction computation. We look for the following pattern:
3680 :
3681 : loop_header:
3682 : a1 = phi < a0, a2 >
3683 : a3 = ...
3684 : a2 = operation (a3, a1)
3685 :
3686 : or
3687 :
3688 : a3 = ...
3689 : loop_header:
3690 : a1 = phi < a0, a2 >
3691 : a2 = operation (a3, a1)
3692 :
3693 : such that:
3694 : 1. operation is commutative and associative and it is safe to
3695 : change the order of the computation
3696 : 2. no uses for a2 in the loop (a2 is used out of the loop)
3697 : 3. no uses of a1 in the loop besides the reduction operation
3698 : 4. no uses of a1 outside the loop.
3699 :
3700 : Conditions 1,4 are tested here.
3701 : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3702 :
3703 : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3704 : nested cycles.
3705 :
3706 : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3707 : reductions:
3708 :
3709 : a1 = phi < a0, a2 >
3710 : inner loop (def of a3)
3711 : a2 = phi < a3 >
3712 :
3713 : (4) Detect condition expressions, ie:
3714 : for (int i = 0; i < N; i++)
3715 : if (a[i] < val)
3716 : ret_val = a[i];
3717 :
3718 : */
3719 :
3720 : static stmt_vec_info
3721 139515 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3722 : gphi **double_reduc)
3723 : {
3724 139515 : gphi *phi = as_a <gphi *> (phi_info->stmt);
3725 139515 : gimple *phi_use_stmt = NULL;
3726 139515 : imm_use_iterator imm_iter;
3727 139515 : use_operand_p use_p;
3728 :
3729 : /* When double_reduc is NULL we are testing the inner loop of a
3730 : double reduction. */
3731 139515 : bool inner_loop_of_double_reduc = double_reduc == NULL;
3732 139515 : if (double_reduc)
3733 138468 : *double_reduc = NULL;
3734 139515 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3735 :
3736 139515 : tree phi_name = PHI_RESULT (phi);
3737 : /* ??? If there are no uses of the PHI result the inner loop reduction
3738 : won't be detected as possibly double-reduction by vectorizable_reduction
3739 : because that tries to walk the PHI arg from the preheader edge which
3740 : can be constant. See PR60382. */
3741 139515 : if (has_zero_uses (phi_name))
3742 : return NULL;
3743 139388 : class loop *loop = (gimple_bb (phi))->loop_father;
3744 139388 : unsigned nphi_def_loop_uses = 0;
3745 535453 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3746 : {
3747 268349 : gimple *use_stmt = USE_STMT (use_p);
3748 268349 : if (is_gimple_debug (use_stmt))
3749 71994 : continue;
3750 :
3751 196355 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3752 : {
3753 11672 : if (dump_enabled_p ())
3754 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3755 : "intermediate value used outside loop.\n");
3756 :
3757 11672 : return NULL;
3758 : }
3759 :
3760 : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
3761 : op1 twice (once as definition, once as else) in the same operation.
3762 : Only count it as one. */
3763 184683 : if (use_stmt != phi_use_stmt)
3764 : {
3765 180818 : nphi_def_loop_uses++;
3766 180818 : phi_use_stmt = use_stmt;
3767 : }
3768 11672 : }
3769 :
3770 127716 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3771 127716 : if (TREE_CODE (latch_def) != SSA_NAME)
3772 : {
3773 1453 : if (dump_enabled_p ())
3774 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3775 : "reduction: not ssa_name: %T\n", latch_def);
3776 1453 : return NULL;
3777 : }
3778 :
3779 126263 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3780 126263 : if (!def_stmt_info
3781 126263 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3782 161 : return NULL;
3783 :
3784 126102 : bool nested_in_vect_loop
3785 126102 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3786 126102 : unsigned nlatch_def_loop_uses = 0;
3787 126102 : auto_vec<gphi *, 3> lcphis;
3788 618410 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3789 : {
3790 366206 : gimple *use_stmt = USE_STMT (use_p);
3791 366206 : if (is_gimple_debug (use_stmt))
3792 111548 : continue;
3793 254658 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3794 165633 : nlatch_def_loop_uses++;
3795 : else
3796 : /* We can have more than one loop-closed PHI. */
3797 89025 : lcphis.safe_push (as_a <gphi *> (use_stmt));
3798 126102 : }
3799 :
3800 : /* If we are vectorizing an inner reduction we are executing that
3801 : in the original order only in case we are not dealing with a
3802 : double reduction. */
3803 126102 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3804 : {
3805 2272 : if (dump_enabled_p ())
3806 433 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3807 : "detected nested cycle: ");
3808 2272 : return def_stmt_info;
3809 : }
3810 :
3811 : /* When the inner loop of a double reduction ends up with more than
3812 : one loop-closed PHI we have failed to classify alternate such
3813 : PHIs as double reduction, leading to wrong code. See PR103237. */
3814 124865 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
3815 : {
3816 1 : if (dump_enabled_p ())
3817 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3818 : "unhandle double reduction\n");
3819 1 : return NULL;
3820 : }
3821 :
3822 : /* If this isn't a nested cycle or if the nested cycle reduction value
3823 : is used ouside of the inner loop we cannot handle uses of the reduction
3824 : value. */
3825 123829 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3826 : {
3827 46300 : if (dump_enabled_p ())
3828 401 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3829 : "reduction used in loop.\n");
3830 46300 : return NULL;
3831 : }
3832 :
3833 : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3834 : defined in the inner loop. */
3835 77529 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3836 : {
3837 1302 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
3838 1302 : if (gimple_phi_num_args (def_stmt) != 1
3839 1302 : || TREE_CODE (op1) != SSA_NAME)
3840 : {
3841 91 : if (dump_enabled_p ())
3842 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3843 : "unsupported phi node definition.\n");
3844 :
3845 91 : return NULL;
3846 : }
3847 :
3848 : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3849 : and the latch definition op1. */
3850 1211 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
3851 1211 : if (gimple_bb (def1)
3852 1211 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3853 1211 : && loop->inner
3854 1157 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3855 1157 : && (is_gimple_assign (def1) || is_gimple_call (def1))
3856 1148 : && is_a <gphi *> (phi_use_stmt)
3857 1136 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
3858 1136 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
3859 : loop_latch_edge (loop->inner)))
3860 2345 : && lcphis.length () == 1)
3861 : {
3862 1047 : if (dump_enabled_p ())
3863 144 : report_vect_op (MSG_NOTE, def_stmt,
3864 : "detected double reduction: ");
3865 :
3866 1047 : *double_reduc = as_a <gphi *> (phi_use_stmt);
3867 1047 : return def_stmt_info;
3868 : }
3869 :
3870 164 : return NULL;
3871 : }
3872 :
3873 : /* Look for the expression computing latch_def from then loop PHI result. */
3874 76227 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3875 76227 : code_helper code;
3876 76227 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3877 : path, inner_loop_of_double_reduc))
3878 : {
3879 69218 : STMT_VINFO_REDUC_CODE (phi_info) = code;
3880 69218 : if (code == COND_EXPR && !nested_in_vect_loop)
3881 8177 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3882 :
3883 : /* Fill in STMT_VINFO_REDUC_IDX. */
3884 69218 : unsigned i;
3885 222222 : for (i = path.length () - 1; i >= 1; --i)
3886 : {
3887 83786 : gimple *stmt = USE_STMT (path[i].second);
3888 83786 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3889 83786 : gimple_match_op op;
3890 83786 : if (!gimple_extract_op (stmt, &op))
3891 0 : gcc_unreachable ();
3892 83786 : if (gassign *assign = dyn_cast<gassign *> (stmt))
3893 80282 : STMT_VINFO_REDUC_IDX (stmt_info)
3894 80282 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3895 : else
3896 : {
3897 3504 : gcall *call = as_a<gcall *> (stmt);
3898 3504 : STMT_VINFO_REDUC_IDX (stmt_info)
3899 3504 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
3900 : }
3901 : }
3902 69218 : if (dump_enabled_p ())
3903 3918 : dump_printf_loc (MSG_NOTE, vect_location,
3904 : "reduction: detected reduction\n");
3905 :
3906 69218 : return def_stmt_info;
3907 : }
3908 :
3909 7009 : if (dump_enabled_p ())
3910 86 : dump_printf_loc (MSG_NOTE, vect_location,
3911 : "reduction: unknown pattern\n");
3912 :
3913 : return NULL;
3914 202329 : }
3915 :
3916 : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3917 : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3918 : or -1 if not known. */
3919 :
3920 : static int
3921 361909 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3922 : {
3923 361909 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3924 361909 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3925 : {
3926 156621 : if (dump_enabled_p ())
3927 2912 : dump_printf_loc (MSG_NOTE, vect_location,
3928 : "cost model: epilogue peel iters set to vf/2 "
3929 : "because loop iterations are unknown .\n");
3930 156621 : return assumed_vf / 2;
3931 : }
3932 : else
3933 : {
3934 205288 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3935 205288 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
3936 205288 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3937 : /* If we need to peel for gaps, but no peeling is required, we have to
3938 : peel VF iterations. */
3939 205288 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3940 205288 : peel_iters_epilogue = assumed_vf;
3941 205288 : return peel_iters_epilogue;
3942 : }
3943 : }
3944 :
3945 : /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3946 : int
3947 279798 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3948 : int *peel_iters_epilogue,
3949 : stmt_vector_for_cost *scalar_cost_vec,
3950 : stmt_vector_for_cost *prologue_cost_vec,
3951 : stmt_vector_for_cost *epilogue_cost_vec)
3952 : {
3953 279798 : int retval = 0;
3954 :
3955 279798 : *peel_iters_epilogue
3956 279798 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3957 :
3958 279798 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3959 : {
3960 : /* If peeled iterations are known but number of scalar loop
3961 : iterations are unknown, count a taken branch per peeled loop. */
3962 107617 : if (peel_iters_prologue > 0)
3963 69152 : retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3964 : vect_prologue);
3965 107617 : if (*peel_iters_epilogue > 0)
3966 107540 : retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3967 : vect_epilogue);
3968 : }
3969 :
3970 279798 : stmt_info_for_cost *si;
3971 279798 : int j;
3972 279798 : if (peel_iters_prologue)
3973 607693 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3974 479980 : retval += record_stmt_cost (prologue_cost_vec,
3975 479980 : si->count * peel_iters_prologue,
3976 : si->kind, si->stmt_info, si->misalign,
3977 : vect_prologue);
3978 279798 : if (*peel_iters_epilogue)
3979 944066 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3980 749211 : retval += record_stmt_cost (epilogue_cost_vec,
3981 749211 : si->count * *peel_iters_epilogue,
3982 : si->kind, si->stmt_info, si->misalign,
3983 : vect_epilogue);
3984 :
3985 279798 : return retval;
3986 : }
3987 :
3988 : /* Function vect_estimate_min_profitable_iters
3989 :
3990 : Return the number of iterations required for the vector version of the
3991 : loop to be profitable relative to the cost of the scalar version of the
3992 : loop.
3993 :
3994 : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3995 : of iterations for vectorization. -1 value means loop vectorization
3996 : is not profitable. This returned value may be used for dynamic
3997 : profitability check.
3998 :
3999 : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4000 : for static check against estimated number of iterations. */
4001 :
4002 : static void
4003 99278 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4004 : int *ret_min_profitable_niters,
4005 : int *ret_min_profitable_estimate,
4006 : unsigned *suggested_unroll_factor)
4007 : {
4008 99278 : int min_profitable_iters;
4009 99278 : int min_profitable_estimate;
4010 99278 : int peel_iters_prologue;
4011 99278 : int peel_iters_epilogue;
4012 99278 : unsigned vec_inside_cost = 0;
4013 99278 : int vec_outside_cost = 0;
4014 99278 : unsigned vec_prologue_cost = 0;
4015 99278 : unsigned vec_epilogue_cost = 0;
4016 99278 : int scalar_single_iter_cost = 0;
4017 99278 : int scalar_outside_cost = 0;
4018 99278 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4019 99278 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4020 99278 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
4021 :
4022 : /* Cost model disabled. */
4023 99278 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4024 : {
4025 16862 : if (dump_enabled_p ())
4026 10589 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4027 16862 : *ret_min_profitable_niters = 0;
4028 16862 : *ret_min_profitable_estimate = 0;
4029 16862 : return;
4030 : }
4031 :
4032 : /* Requires loop versioning tests to handle misalignment. */
4033 82416 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4034 : {
4035 : /* FIXME: Make cost depend on complexity of individual check. */
4036 16 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4037 16 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4038 16 : if (dump_enabled_p ())
4039 1 : dump_printf (MSG_NOTE,
4040 : "cost model: Adding cost of checks for loop "
4041 : "versioning to treat misalignment.\n");
4042 : }
4043 :
4044 : /* Requires loop versioning with alias checks. */
4045 82416 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4046 : {
4047 : /* FIXME: Make cost depend on complexity of individual check. */
4048 4117 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4049 4117 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4050 4117 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4051 2 : if (len)
4052 : /* Count LEN - 1 ANDs and LEN comparisons. */
4053 2 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4054 : scalar_stmt, vect_prologue);
4055 4117 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4056 1108 : if (len)
4057 : {
4058 : /* Count LEN - 1 ANDs and LEN comparisons. */
4059 1108 : unsigned int nstmts = len * 2 - 1;
4060 : /* +1 for each bias that needs adding. */
4061 2216 : for (unsigned int i = 0; i < len; ++i)
4062 1108 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4063 125 : nstmts += 1;
4064 1108 : (void) add_stmt_cost (target_cost_data, nstmts,
4065 : scalar_stmt, vect_prologue);
4066 : }
4067 4117 : if (dump_enabled_p ())
4068 18 : dump_printf (MSG_NOTE,
4069 : "cost model: Adding cost of checks for loop "
4070 : "versioning aliasing.\n");
4071 : }
4072 :
4073 : /* Requires loop versioning with niter checks. */
4074 82416 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4075 : {
4076 : /* FIXME: Make cost depend on complexity of individual check. */
4077 665 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4078 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4079 665 : if (dump_enabled_p ())
4080 1 : dump_printf (MSG_NOTE,
4081 : "cost model: Adding cost of checks for loop "
4082 : "versioning niters.\n");
4083 : }
4084 :
4085 82416 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4086 4794 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4087 : vect_prologue);
4088 :
4089 : /* Count statements in scalar loop. Using this as scalar cost for a single
4090 : iteration for now.
4091 :
4092 : TODO: Add outer loop support.
4093 :
4094 : TODO: Consider assigning different costs to different scalar
4095 : statements. */
4096 :
4097 82416 : scalar_single_iter_cost = (loop_vinfo->scalar_costs->total_cost ()
4098 82416 : * param_vect_scalar_cost_multiplier) / 100;
4099 :
4100 : /* Add additional cost for the peeled instructions in prologue and epilogue
4101 : loop. (For fully-masked loops there will be no peeling.)
4102 :
4103 : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4104 : at compile-time - we assume it's vf/2 (the worst would be vf-1).
4105 :
4106 : TODO: Build an expression that represents peel_iters for prologue and
4107 : epilogue to be used in a run-time test. */
4108 :
4109 82416 : bool prologue_need_br_taken_cost = false;
4110 82416 : bool prologue_need_br_not_taken_cost = false;
4111 :
4112 : /* Calculate peel_iters_prologue. */
4113 82416 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4114 : peel_iters_prologue = 0;
4115 82416 : else if (npeel < 0)
4116 : {
4117 283 : peel_iters_prologue = assumed_vf / 2;
4118 283 : if (dump_enabled_p ())
4119 6 : dump_printf (MSG_NOTE, "cost model: "
4120 : "prologue peel iters set to vf/2.\n");
4121 :
4122 : /* If peeled iterations are unknown, count a taken branch and a not taken
4123 : branch per peeled loop. Even if scalar loop iterations are known,
4124 : vector iterations are not known since peeled prologue iterations are
4125 : not known. Hence guards remain the same. */
4126 : prologue_need_br_taken_cost = true;
4127 : prologue_need_br_not_taken_cost = true;
4128 : }
4129 : else
4130 : {
4131 82133 : peel_iters_prologue = npeel;
4132 82133 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4133 : /* If peeled iterations are known but number of scalar loop
4134 : iterations are unknown, count a taken branch per peeled loop. */
4135 82416 : prologue_need_br_taken_cost = true;
4136 : }
4137 :
4138 82416 : bool epilogue_need_br_taken_cost = false;
4139 82416 : bool epilogue_need_br_not_taken_cost = false;
4140 :
4141 : /* Calculate peel_iters_epilogue. */
4142 82416 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4143 : /* We need to peel exactly one iteration for gaps. */
4144 22 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4145 82394 : else if (npeel < 0)
4146 : {
4147 : /* If peeling for alignment is unknown, loop bound of main loop
4148 : becomes unknown. */
4149 283 : peel_iters_epilogue = assumed_vf / 2;
4150 283 : if (dump_enabled_p ())
4151 6 : dump_printf (MSG_NOTE, "cost model: "
4152 : "epilogue peel iters set to vf/2 because "
4153 : "peeling for alignment is unknown.\n");
4154 :
4155 : /* See the same reason above in peel_iters_prologue calculation. */
4156 : epilogue_need_br_taken_cost = true;
4157 : epilogue_need_br_not_taken_cost = true;
4158 : }
4159 : else
4160 : {
4161 82111 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4162 82111 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4163 : /* If peeled iterations are known but number of scalar loop
4164 : iterations are unknown, count a taken branch per peeled loop. */
4165 82416 : epilogue_need_br_taken_cost = true;
4166 : }
4167 :
4168 82416 : stmt_info_for_cost *si;
4169 82416 : int j;
4170 : /* Add costs associated with peel_iters_prologue. */
4171 82416 : if (peel_iters_prologue)
4172 1040 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4173 : {
4174 748 : (void) add_stmt_cost (target_cost_data,
4175 748 : si->count * peel_iters_prologue, si->kind,
4176 : si->stmt_info, si->node, si->vectype,
4177 : si->misalign, vect_prologue);
4178 : }
4179 :
4180 : /* Add costs associated with peel_iters_epilogue. */
4181 82416 : if (peel_iters_epilogue)
4182 282246 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4183 : {
4184 224108 : (void) add_stmt_cost (target_cost_data,
4185 224108 : si->count * peel_iters_epilogue, si->kind,
4186 : si->stmt_info, si->node, si->vectype,
4187 : si->misalign, vect_epilogue);
4188 : }
4189 :
4190 : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4191 :
4192 82416 : if (prologue_need_br_taken_cost)
4193 283 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4194 : vect_prologue);
4195 :
4196 82416 : if (prologue_need_br_not_taken_cost)
4197 283 : (void) add_stmt_cost (target_cost_data, 1,
4198 : cond_branch_not_taken, vect_prologue);
4199 :
4200 82416 : if (epilogue_need_br_taken_cost)
4201 48711 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4202 : vect_epilogue);
4203 :
4204 82416 : if (epilogue_need_br_not_taken_cost)
4205 283 : (void) add_stmt_cost (target_cost_data, 1,
4206 : cond_branch_not_taken, vect_epilogue);
4207 :
4208 : /* Take care of special costs for rgroup controls of partial vectors. */
4209 22 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4210 82438 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4211 : == vect_partial_vectors_avx512))
4212 : {
4213 : /* Calculate how many masks we need to generate. */
4214 22 : unsigned int num_masks = 0;
4215 22 : bool need_saturation = false;
4216 90 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4217 24 : if (rgm.type)
4218 : {
4219 22 : unsigned nvectors = rgm.factor;
4220 22 : num_masks += nvectors;
4221 22 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4222 22 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4223 7 : need_saturation = true;
4224 : }
4225 :
4226 : /* ??? The target isn't able to identify the costs below as
4227 : producing masks so it cannot penaltize cases where we'd run
4228 : out of mask registers for example. */
4229 :
4230 : /* ??? We are also failing to account for smaller vector masks
4231 : we generate by splitting larger masks in vect_get_loop_mask. */
4232 :
4233 : /* In the worst case, we need to generate each mask in the prologue
4234 : and in the loop body. We need one splat per group and one
4235 : compare per mask.
4236 :
4237 : Sometimes the prologue mask will fold to a constant,
4238 : so the actual prologue cost might be smaller. However, it's
4239 : simpler and safer to use the worst-case cost; if this ends up
4240 : being the tie-breaker between vectorizing or not, then it's
4241 : probably better not to vectorize. */
4242 22 : (void) add_stmt_cost (target_cost_data,
4243 : num_masks
4244 22 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4245 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4246 : vect_prologue);
4247 44 : (void) add_stmt_cost (target_cost_data,
4248 : num_masks
4249 44 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4250 : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4251 :
4252 : /* When we need saturation we need it both in the prologue and
4253 : the epilogue. */
4254 22 : if (need_saturation)
4255 : {
4256 7 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4257 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4258 7 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4259 : NULL, NULL, NULL_TREE, 0, vect_body);
4260 : }
4261 : }
4262 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4263 82394 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4264 : == vect_partial_vectors_while_ult))
4265 : {
4266 : /* Calculate how many masks we need to generate. */
4267 : unsigned int num_masks = 0;
4268 : rgroup_controls *rgm;
4269 : unsigned int num_vectors_m1;
4270 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4271 : num_vectors_m1, rgm)
4272 0 : if (rgm->type)
4273 0 : num_masks += num_vectors_m1 + 1;
4274 0 : gcc_assert (num_masks > 0);
4275 :
4276 : /* In the worst case, we need to generate each mask in the prologue
4277 : and in the loop body. One of the loop body mask instructions
4278 : replaces the comparison in the scalar loop, and since we don't
4279 : count the scalar comparison against the scalar body, we shouldn't
4280 : count that vector instruction against the vector body either.
4281 :
4282 : Sometimes we can use unpacks instead of generating prologue
4283 : masks and sometimes the prologue mask will fold to a constant,
4284 : so the actual prologue cost might be smaller. However, it's
4285 : simpler and safer to use the worst-case cost; if this ends up
4286 : being the tie-breaker between vectorizing or not, then it's
4287 : probably better not to vectorize. */
4288 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4289 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4290 : vect_prologue);
4291 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4292 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4293 : vect_body);
4294 : }
4295 82394 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4296 : {
4297 : /* Referring to the functions vect_set_loop_condition_partial_vectors
4298 : and vect_set_loop_controls_directly, we need to generate each
4299 : length in the prologue and in the loop body if required. Although
4300 : there are some possible optimizations, we consider the worst case
4301 : here. */
4302 :
4303 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4304 0 : signed char partial_load_store_bias
4305 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4306 0 : bool need_iterate_p
4307 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4308 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4309 :
4310 : /* Calculate how many statements to be added. */
4311 0 : unsigned int prologue_stmts = 0;
4312 0 : unsigned int body_stmts = 0;
4313 :
4314 0 : rgroup_controls *rgc;
4315 0 : unsigned int num_vectors_m1;
4316 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4317 0 : if (rgc->type)
4318 : {
4319 : /* May need one SHIFT for nitems_total computation. */
4320 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4321 0 : if (nitems != 1 && !niters_known_p)
4322 0 : prologue_stmts += 1;
4323 :
4324 : /* May need one MAX and one MINUS for wrap around. */
4325 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4326 0 : prologue_stmts += 2;
4327 :
4328 : /* Need one MAX and one MINUS for each batch limit excepting for
4329 : the 1st one. */
4330 0 : prologue_stmts += num_vectors_m1 * 2;
4331 :
4332 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4333 :
4334 : /* Need to set up lengths in prologue, only one MIN required
4335 : for each since start index is zero. */
4336 0 : prologue_stmts += num_vectors;
4337 :
4338 : /* If we have a non-zero partial load bias, we need one PLUS
4339 : to adjust the load length. */
4340 0 : if (partial_load_store_bias != 0)
4341 0 : body_stmts += 1;
4342 :
4343 0 : unsigned int length_update_cost = 0;
4344 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4345 : /* For decrement IV style, Each only need a single SELECT_VL
4346 : or MIN since beginning to calculate the number of elements
4347 : need to be processed in current iteration. */
4348 : length_update_cost = 1;
4349 : else
4350 : /* For increment IV stype, Each may need two MINs and one MINUS to
4351 : update lengths in body for next iteration. */
4352 0 : length_update_cost = 3;
4353 :
4354 0 : if (need_iterate_p)
4355 0 : body_stmts += length_update_cost * num_vectors;
4356 : }
4357 :
4358 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4359 : scalar_stmt, vect_prologue);
4360 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4361 : scalar_stmt, vect_body);
4362 : }
4363 :
4364 : /* FORNOW: The scalar outside cost is incremented in one of the
4365 : following ways:
4366 :
4367 : 1. The vectorizer checks for alignment and aliasing and generates
4368 : a condition that allows dynamic vectorization. A cost model
4369 : check is ANDED with the versioning condition. Hence scalar code
4370 : path now has the added cost of the versioning check.
4371 :
4372 : if (cost > th & versioning_check)
4373 : jmp to vector code
4374 :
4375 : Hence run-time scalar is incremented by not-taken branch cost.
4376 :
4377 : 2. The vectorizer then checks if a prologue is required. If the
4378 : cost model check was not done before during versioning, it has to
4379 : be done before the prologue check.
4380 :
4381 : if (cost <= th)
4382 : prologue = scalar_iters
4383 : if (prologue == 0)
4384 : jmp to vector code
4385 : else
4386 : execute prologue
4387 : if (prologue == num_iters)
4388 : go to exit
4389 :
4390 : Hence the run-time scalar cost is incremented by a taken branch,
4391 : plus a not-taken branch, plus a taken branch cost.
4392 :
4393 : 3. The vectorizer then checks if an epilogue is required. If the
4394 : cost model check was not done before during prologue check, it
4395 : has to be done with the epilogue check.
4396 :
4397 : if (prologue == 0)
4398 : jmp to vector code
4399 : else
4400 : execute prologue
4401 : if (prologue == num_iters)
4402 : go to exit
4403 : vector code:
4404 : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4405 : jmp to epilogue
4406 :
4407 : Hence the run-time scalar cost should be incremented by 2 taken
4408 : branches.
4409 :
4410 : TODO: The back end may reorder the BBS's differently and reverse
4411 : conditions/branch directions. Change the estimates below to
4412 : something more reasonable. */
4413 :
4414 : /* If the number of iterations is known and we do not do versioning, we can
4415 : decide whether to vectorize at compile time. Hence the scalar version
4416 : do not carry cost model guard costs. */
4417 33151 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4418 115567 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4419 : {
4420 : /* Cost model check occurs at versioning. */
4421 49880 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4422 4794 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4423 : else
4424 : {
4425 : /* Cost model check occurs at prologue generation. */
4426 45086 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4427 150 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4428 150 : + vect_get_stmt_cost (cond_branch_not_taken);
4429 : /* Cost model check occurs at epilogue generation. */
4430 : else
4431 44936 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4432 : }
4433 : }
4434 :
4435 : /* Complete the target-specific cost calculations. */
4436 82416 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
4437 82416 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
4438 82416 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
4439 82416 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
4440 82416 : if (suggested_unroll_factor)
4441 82229 : *suggested_unroll_factor
4442 82229 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
4443 :
4444 82229 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
4445 233 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4446 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4447 : *suggested_unroll_factor,
4448 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4449 : {
4450 0 : if (dump_enabled_p ())
4451 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4452 : "can't unroll as unrolled vectorization factor larger"
4453 : " than maximum vectorization factor: "
4454 : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4455 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4456 0 : *suggested_unroll_factor = 1;
4457 : }
4458 :
4459 82416 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4460 :
4461 82416 : if (dump_enabled_p ())
4462 : {
4463 609 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4464 609 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4465 : vec_inside_cost);
4466 609 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4467 : vec_prologue_cost);
4468 609 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4469 : vec_epilogue_cost);
4470 609 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4471 : scalar_single_iter_cost);
4472 609 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4473 : scalar_outside_cost);
4474 609 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4475 : vec_outside_cost);
4476 609 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4477 : peel_iters_prologue);
4478 609 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4479 : peel_iters_epilogue);
4480 : }
4481 :
4482 : /* Calculate number of iterations required to make the vector version
4483 : profitable, relative to the loop bodies only. The following condition
4484 : must hold true:
4485 : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4486 : where
4487 : SIC = scalar iteration cost, VIC = vector iteration cost,
4488 : VOC = vector outside cost, VF = vectorization factor,
4489 : NPEEL = prologue iterations + epilogue iterations,
4490 : SOC = scalar outside cost for run time cost model check. */
4491 :
4492 82416 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4493 82416 : - vec_inside_cost);
4494 82416 : if (saving_per_viter <= 0)
4495 : {
4496 24149 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4497 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4498 : "vectorization did not happen for a simd loop");
4499 :
4500 24149 : if (dump_enabled_p ())
4501 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4502 : "cost model: the vector iteration cost = %d "
4503 : "divided by the scalar iteration cost = %d "
4504 : "is greater or equal to the vectorization factor = %d"
4505 : ".\n",
4506 : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4507 24149 : *ret_min_profitable_niters = -1;
4508 24149 : *ret_min_profitable_estimate = -1;
4509 24149 : return;
4510 : }
4511 :
4512 : /* ??? The "if" arm is written to handle all cases; see below for what
4513 : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4514 58267 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4515 : {
4516 : /* Rewriting the condition above in terms of the number of
4517 : vector iterations (vniters) rather than the number of
4518 : scalar iterations (niters) gives:
4519 :
4520 : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4521 :
4522 : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4523 :
4524 : For integer N, X and Y when X > 0:
4525 :
4526 : N * X > Y <==> N >= (Y /[floor] X) + 1. */
4527 14 : int outside_overhead = (vec_outside_cost
4528 14 : - scalar_single_iter_cost * peel_iters_prologue
4529 14 : - scalar_single_iter_cost * peel_iters_epilogue
4530 : - scalar_outside_cost);
4531 : /* We're only interested in cases that require at least one
4532 : vector iteration. */
4533 14 : int min_vec_niters = 1;
4534 14 : if (outside_overhead > 0)
4535 11 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4536 :
4537 14 : if (dump_enabled_p ())
4538 6 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4539 : min_vec_niters);
4540 :
4541 14 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4542 : {
4543 : /* Now that we know the minimum number of vector iterations,
4544 : find the minimum niters for which the scalar cost is larger:
4545 :
4546 : SIC * niters > VIC * vniters + VOC - SOC
4547 :
4548 : We know that the minimum niters is no more than
4549 : vniters * VF + NPEEL, but it might be (and often is) less
4550 : than that if a partial vector iteration is cheaper than the
4551 : equivalent scalar code. */
4552 14 : int threshold = (vec_inside_cost * min_vec_niters
4553 14 : + vec_outside_cost
4554 14 : - scalar_outside_cost);
4555 14 : if (threshold <= 0)
4556 : min_profitable_iters = 1;
4557 : else
4558 14 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4559 : }
4560 : else
4561 : /* Convert the number of vector iterations into a number of
4562 : scalar iterations. */
4563 0 : min_profitable_iters = (min_vec_niters * assumed_vf
4564 0 : + peel_iters_prologue
4565 : + peel_iters_epilogue);
4566 : }
4567 : else
4568 : {
4569 58253 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4570 58253 : * assumed_vf
4571 58253 : - vec_inside_cost * peel_iters_prologue
4572 58253 : - vec_inside_cost * peel_iters_epilogue);
4573 58253 : if (min_profitable_iters <= 0)
4574 : min_profitable_iters = 0;
4575 : else
4576 : {
4577 48971 : min_profitable_iters /= saving_per_viter;
4578 :
4579 48971 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4580 48971 : <= (((int) vec_inside_cost * min_profitable_iters)
4581 48971 : + (((int) vec_outside_cost - scalar_outside_cost)
4582 : * assumed_vf)))
4583 48971 : min_profitable_iters++;
4584 : }
4585 : }
4586 :
4587 58267 : if (dump_enabled_p ())
4588 587 : dump_printf (MSG_NOTE,
4589 : " Calculated minimum iters for profitability: %d\n",
4590 : min_profitable_iters);
4591 :
4592 58267 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4593 58253 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4594 : /* We want the vectorized loop to execute at least once. */
4595 : min_profitable_iters = assumed_vf + peel_iters_prologue;
4596 10779 : else if (min_profitable_iters < peel_iters_prologue)
4597 : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4598 : vectorized loop executes at least once. */
4599 : min_profitable_iters = peel_iters_prologue;
4600 :
4601 58267 : if (dump_enabled_p ())
4602 587 : dump_printf_loc (MSG_NOTE, vect_location,
4603 : " Runtime profitability threshold = %d\n",
4604 : min_profitable_iters);
4605 :
4606 58267 : *ret_min_profitable_niters = min_profitable_iters;
4607 :
4608 : /* Calculate number of iterations required to make the vector version
4609 : profitable, relative to the loop bodies only.
4610 :
4611 : Non-vectorized variant is SIC * niters and it must win over vector
4612 : variant on the expected loop trip count. The following condition must hold true:
4613 : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4614 :
4615 58267 : if (vec_outside_cost <= 0)
4616 : min_profitable_estimate = 0;
4617 : /* ??? This "else if" arm is written to handle all cases; see below for
4618 : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4619 52756 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4620 : {
4621 : /* This is a repeat of the code above, but with + SOC rather
4622 : than - SOC. */
4623 14 : int outside_overhead = (vec_outside_cost
4624 14 : - scalar_single_iter_cost * peel_iters_prologue
4625 14 : - scalar_single_iter_cost * peel_iters_epilogue
4626 : + scalar_outside_cost);
4627 14 : int min_vec_niters = 1;
4628 14 : if (outside_overhead > 0)
4629 14 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4630 :
4631 14 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4632 : {
4633 14 : int threshold = (vec_inside_cost * min_vec_niters
4634 14 : + vec_outside_cost
4635 14 : + scalar_outside_cost);
4636 14 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4637 : }
4638 : else
4639 : min_profitable_estimate = (min_vec_niters * assumed_vf
4640 : + peel_iters_prologue
4641 : + peel_iters_epilogue);
4642 : }
4643 : else
4644 : {
4645 52742 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4646 52742 : * assumed_vf
4647 52742 : - vec_inside_cost * peel_iters_prologue
4648 52742 : - vec_inside_cost * peel_iters_epilogue)
4649 52742 : / ((scalar_single_iter_cost * assumed_vf)
4650 : - vec_inside_cost);
4651 : }
4652 58267 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4653 58267 : if (dump_enabled_p ())
4654 587 : dump_printf_loc (MSG_NOTE, vect_location,
4655 : " Static estimate profitability threshold = %d\n",
4656 : min_profitable_estimate);
4657 :
4658 58267 : *ret_min_profitable_estimate = min_profitable_estimate;
4659 : }
4660 :
4661 : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4662 : vector elements (not bits) for a vector with NELT elements. */
4663 : static void
4664 2185 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4665 : vec_perm_builder *sel)
4666 : {
4667 : /* The encoding is a single stepped pattern. Any wrap-around is handled
4668 : by vec_perm_indices. */
4669 2185 : sel->new_vector (nelt, 1, 3);
4670 8740 : for (unsigned int i = 0; i < 3; i++)
4671 6555 : sel->quick_push (i + offset);
4672 2185 : }
4673 :
4674 : /* Checks whether the target supports whole-vector shifts for vectors of mode
4675 : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4676 : it supports vec_perm_const with masks for all necessary shift amounts. */
4677 : static bool
4678 7681 : have_whole_vector_shift (machine_mode mode)
4679 : {
4680 7681 : if (can_implement_p (vec_shr_optab, mode))
4681 : return true;
4682 :
4683 : /* Variable-length vectors should be handled via the optab. */
4684 61 : unsigned int nelt;
4685 122 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4686 : return false;
4687 :
4688 61 : vec_perm_builder sel;
4689 61 : vec_perm_indices indices;
4690 307 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4691 : {
4692 246 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
4693 246 : indices.new_vector (sel, 2, nelt);
4694 246 : if (!can_vec_perm_const_p (mode, mode, indices, false))
4695 : return false;
4696 : }
4697 : return true;
4698 61 : }
4699 :
4700 : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4701 : multiplication operands have differing signs and (b) we intend
4702 : to emulate the operation using a series of signed DOT_PROD_EXPRs.
4703 : See vect_emulate_mixed_dot_prod for the actual sequence used. */
4704 :
4705 : static bool
4706 2184 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
4707 : {
4708 2184 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
4709 2184 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4710 1731 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4711 : return false;
4712 :
4713 578 : tree rhs1 = gimple_assign_rhs1 (assign);
4714 578 : tree rhs2 = gimple_assign_rhs2 (assign);
4715 578 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4716 : return false;
4717 :
4718 429 : return !directly_supported_p (DOT_PROD_EXPR,
4719 : SLP_TREE_VECTYPE (slp_node),
4720 143 : SLP_TREE_VECTYPE
4721 : (SLP_TREE_CHILDREN (slp_node)[0]),
4722 143 : optab_vector_mixed_sign);
4723 : }
4724 :
4725 : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4726 : functions. Design better to avoid maintenance issues. */
4727 :
4728 : /* Function vect_model_reduction_cost.
4729 :
4730 : Models cost for a reduction operation, including the vector ops
4731 : generated within the strip-mine loop in some cases, the initial
4732 : definition before the loop, and the epilogue code that must be generated. */
4733 :
4734 : static void
4735 46947 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
4736 : slp_tree node, internal_fn reduc_fn,
4737 : vect_reduction_type reduction_type,
4738 : int ncopies, stmt_vector_for_cost *cost_vec)
4739 : {
4740 46947 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4741 46947 : tree vectype;
4742 46947 : machine_mode mode;
4743 46947 : class loop *loop = NULL;
4744 :
4745 46947 : if (loop_vinfo)
4746 46947 : loop = LOOP_VINFO_LOOP (loop_vinfo);
4747 :
4748 : /* Condition reductions generate two reductions in the loop. */
4749 46947 : if (reduction_type == COND_REDUCTION)
4750 280 : ncopies *= 2;
4751 :
4752 46947 : vectype = SLP_TREE_VECTYPE (node);
4753 46947 : mode = TYPE_MODE (vectype);
4754 46947 : stmt_vec_info orig_stmt_info
4755 46947 : = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
4756 :
4757 46947 : gimple_match_op op;
4758 46947 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4759 0 : gcc_unreachable ();
4760 :
4761 46947 : if (reduction_type == EXTRACT_LAST_REDUCTION)
4762 : /* No extra instructions are needed in the prologue. The loop body
4763 : operations are costed in vectorizable_condition. */
4764 : inside_cost = 0;
4765 46947 : else if (reduction_type == FOLD_LEFT_REDUCTION)
4766 : {
4767 : /* No extra instructions needed in the prologue. */
4768 3927 : prologue_cost = 0;
4769 :
4770 3927 : if (reduc_fn != IFN_LAST)
4771 : /* Count one reduction-like operation per vector. */
4772 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4773 : node, 0, vect_body);
4774 : else
4775 : {
4776 : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4777 3927 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4778 3927 : inside_cost = record_stmt_cost (cost_vec, nelements,
4779 : vec_to_scalar, node, 0,
4780 : vect_body);
4781 3927 : inside_cost += record_stmt_cost (cost_vec, nelements,
4782 : scalar_stmt, node, 0,
4783 : vect_body);
4784 : }
4785 : }
4786 : else
4787 : {
4788 : /* Add in the cost of the initial definitions. */
4789 43020 : int prologue_stmts;
4790 43020 : if (reduction_type == COND_REDUCTION)
4791 : /* For cond reductions we have four vectors: initial index, step,
4792 : initial result of the data reduction, initial value of the index
4793 : reduction. */
4794 : prologue_stmts = 4;
4795 : else
4796 : /* We need the initial reduction value. */
4797 42740 : prologue_stmts = 1;
4798 43020 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4799 : scalar_to_vec, node, 0,
4800 : vect_prologue);
4801 : }
4802 :
4803 : /* Determine cost of epilogue code.
4804 :
4805 : We have a reduction operator that will reduce the vector in one statement.
4806 : Also requires scalar extract. */
4807 :
4808 46947 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4809 : {
4810 46788 : if (reduc_fn != IFN_LAST)
4811 : {
4812 35371 : if (reduction_type == COND_REDUCTION)
4813 : {
4814 : /* An EQ stmt and an COND_EXPR stmt. */
4815 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4816 : vector_stmt, node, 0,
4817 : vect_epilogue);
4818 : /* Reduction of the max index and a reduction of the found
4819 : values. */
4820 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4821 : vec_to_scalar, node, 0,
4822 : vect_epilogue);
4823 : /* A broadcast of the max value. */
4824 8 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4825 : scalar_to_vec, node, 0,
4826 : vect_epilogue);
4827 : }
4828 : else
4829 : {
4830 35363 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4831 : node, 0, vect_epilogue);
4832 35363 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4833 : vec_to_scalar, node, 0,
4834 : vect_epilogue);
4835 : }
4836 : }
4837 11417 : else if (reduction_type == COND_REDUCTION)
4838 : {
4839 272 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4840 : /* Extraction of scalar elements. */
4841 544 : epilogue_cost += record_stmt_cost (cost_vec,
4842 272 : 2 * estimated_nunits,
4843 : vec_to_scalar, node, 0,
4844 : vect_epilogue);
4845 : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4846 272 : epilogue_cost += record_stmt_cost (cost_vec,
4847 272 : 2 * estimated_nunits - 3,
4848 : scalar_stmt, node, 0,
4849 : vect_epilogue);
4850 : }
4851 11145 : else if (reduction_type == EXTRACT_LAST_REDUCTION
4852 11145 : || reduction_type == FOLD_LEFT_REDUCTION)
4853 : /* No extra instructions need in the epilogue. */
4854 : ;
4855 : else
4856 : {
4857 7218 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4858 7218 : tree bitsize = TYPE_SIZE (op.type);
4859 7218 : int element_bitsize = tree_to_uhwi (bitsize);
4860 7218 : int nelements = vec_size_in_bits / element_bitsize;
4861 :
4862 7218 : if (op.code == COND_EXPR)
4863 28 : op.code = MAX_EXPR;
4864 :
4865 : /* We have a whole vector shift available. */
4866 968 : if (VECTOR_MODE_P (mode)
4867 7218 : && directly_supported_p (op.code, vectype)
4868 13029 : && have_whole_vector_shift (mode))
4869 : {
4870 : /* Final reduction via vector shifts and the reduction operator.
4871 : Also requires scalar extract. */
4872 17433 : epilogue_cost += record_stmt_cost (cost_vec,
4873 11622 : exact_log2 (nelements) * 2,
4874 : vector_stmt, node, 0,
4875 : vect_epilogue);
4876 5811 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4877 : vec_to_scalar, node, 0,
4878 : vect_epilogue);
4879 : }
4880 : else
4881 : /* Use extracts and reduction op for final reduction. For N
4882 : elements, we have N extracts and N-1 reduction ops. */
4883 1407 : epilogue_cost += record_stmt_cost (cost_vec,
4884 1407 : nelements + nelements - 1,
4885 : vector_stmt, node, 0,
4886 : vect_epilogue);
4887 : }
4888 : }
4889 :
4890 46947 : if (dump_enabled_p ())
4891 2846 : dump_printf (MSG_NOTE,
4892 : "vect_model_reduction_cost: inside_cost = %d, "
4893 : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4894 : prologue_cost, epilogue_cost);
4895 46947 : }
4896 :
4897 : /* SEQ is a sequence of instructions that initialize the reduction
4898 : described by REDUC_INFO. Emit them in the appropriate place. */
4899 :
4900 : static void
4901 445 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4902 : vect_reduc_info reduc_info, gimple *seq)
4903 : {
4904 445 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
4905 : {
4906 : /* When reusing an accumulator from the main loop, we only need
4907 : initialization instructions if the main loop can be skipped.
4908 : In that case, emit the initialization instructions at the end
4909 : of the guard block that does the skip. */
4910 25 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
4911 25 : gcc_assert (skip_edge);
4912 25 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4913 25 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4914 : }
4915 : else
4916 : {
4917 : /* The normal case: emit the initialization instructions on the
4918 : preheader edge. */
4919 420 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4920 420 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4921 : }
4922 445 : }
4923 :
4924 : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4925 : which performs a reduction involving GROUP_SIZE scalar statements.
4926 : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4927 : is nonnull, introducing extra elements of that value will not change the
4928 : result. */
4929 :
4930 : static void
4931 21855 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4932 : vect_reduc_info reduc_info,
4933 : tree vector_type,
4934 : vec<tree> *vec_oprnds,
4935 : unsigned int number_of_vectors,
4936 : unsigned int group_size, tree neutral_op)
4937 : {
4938 21855 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
4939 21855 : unsigned HOST_WIDE_INT nunits;
4940 21855 : unsigned j, number_of_places_left_in_vector;
4941 21855 : unsigned int i;
4942 :
4943 43710 : gcc_assert (group_size == initial_values.length () || neutral_op);
4944 :
4945 : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4946 : created vectors. It is greater than 1 if unrolling is performed.
4947 :
4948 : For example, we have two scalar operands, s1 and s2 (e.g., group of
4949 : strided accesses of size two), while NUNITS is four (i.e., four scalars
4950 : of this type can be packed in a vector). The output vector will contain
4951 : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4952 : will be 2).
4953 :
4954 : If GROUP_SIZE > NUNITS, the scalars will be split into several
4955 : vectors containing the operands.
4956 :
4957 : For example, NUNITS is four as before, and the group size is 8
4958 : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4959 : {s5, s6, s7, s8}. */
4960 :
4961 21855 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4962 : nunits = group_size;
4963 :
4964 21855 : tree vector_elt_type = TREE_TYPE (vector_type);
4965 21855 : number_of_places_left_in_vector = nunits;
4966 21855 : bool constant_p = true;
4967 21855 : tree_vector_builder elts (vector_type, nunits, 1);
4968 21855 : elts.quick_grow (nunits);
4969 21855 : gimple_seq ctor_seq = NULL;
4970 21855 : if (neutral_op
4971 43139 : && !useless_type_conversion_p (vector_elt_type,
4972 21284 : TREE_TYPE (neutral_op)))
4973 : {
4974 220 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4975 199 : neutral_op = gimple_build (&ctor_seq, COND_EXPR,
4976 : vector_elt_type,
4977 : neutral_op,
4978 : build_all_ones_cst (vector_elt_type),
4979 : build_zero_cst (vector_elt_type));
4980 : else
4981 21 : neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
4982 : }
4983 210515 : for (j = 0; j < nunits * number_of_vectors; ++j)
4984 : {
4985 188660 : tree op;
4986 188660 : i = j % group_size;
4987 :
4988 : /* Get the def before the loop. In reduction chain we have only
4989 : one initial value. Else we have as many as PHIs in the group. */
4990 188660 : if (i >= initial_values.length () || (j > i && neutral_op))
4991 : op = neutral_op;
4992 : else
4993 : {
4994 51304 : if (!useless_type_conversion_p (vector_elt_type,
4995 25652 : TREE_TYPE (initial_values[i])))
4996 : {
4997 235 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4998 422 : initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
4999 : vector_elt_type,
5000 211 : initial_values[i],
5001 : build_all_ones_cst
5002 : (vector_elt_type),
5003 : build_zero_cst
5004 : (vector_elt_type));
5005 : else
5006 48 : initial_values[i] = gimple_convert (&ctor_seq,
5007 : vector_elt_type,
5008 24 : initial_values[i]);
5009 : }
5010 25652 : op = initial_values[i];
5011 : }
5012 :
5013 : /* Create 'vect_ = {op0,op1,...,opn}'. */
5014 188660 : number_of_places_left_in_vector--;
5015 188660 : elts[nunits - number_of_places_left_in_vector - 1] = op;
5016 188660 : if (!CONSTANT_CLASS_P (op))
5017 2360 : constant_p = false;
5018 :
5019 188660 : if (number_of_places_left_in_vector == 0)
5020 : {
5021 23317 : tree init;
5022 46634 : if (constant_p && !neutral_op
5023 46346 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5024 23317 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5025 : /* Build the vector directly from ELTS. */
5026 23317 : init = gimple_build_vector (&ctor_seq, &elts);
5027 0 : else if (neutral_op)
5028 : {
5029 : /* Build a vector of the neutral value and shift the
5030 : other elements into place. */
5031 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5032 : neutral_op);
5033 0 : int k = nunits;
5034 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5035 : k -= 1;
5036 0 : while (k > 0)
5037 : {
5038 0 : k -= 1;
5039 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5040 0 : vector_type, init, elts[k]);
5041 : }
5042 : }
5043 : else
5044 : {
5045 : /* First time round, duplicate ELTS to fill the
5046 : required number of vectors. */
5047 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5048 : elts, number_of_vectors, *vec_oprnds);
5049 0 : break;
5050 : }
5051 23317 : vec_oprnds->quick_push (init);
5052 :
5053 23317 : number_of_places_left_in_vector = nunits;
5054 23317 : elts.new_vector (vector_type, nunits, 1);
5055 23317 : elts.quick_grow (nunits);
5056 23317 : constant_p = true;
5057 : }
5058 : }
5059 21855 : if (ctor_seq != NULL)
5060 445 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5061 21855 : }
5062 :
5063 : vect_reduc_info
5064 133055 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
5065 : {
5066 133055 : if (node->cycle_info.id == -1)
5067 : return NULL;
5068 131145 : return loop_vinfo->reduc_infos[node->cycle_info.id];
5069 : }
5070 :
5071 : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5072 : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5073 : return false. */
5074 :
5075 : static bool
5076 21494 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5077 : vect_reduc_info reduc_info, tree vectype)
5078 : {
5079 21494 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5080 21494 : if (!main_loop_vinfo)
5081 : return false;
5082 :
5083 4839 : if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5084 : return false;
5085 :
5086 : /* We are not set up to handle vector bools when they are not mapped
5087 : to vector integer data types. */
5088 4824 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5089 4894 : && GET_MODE_CLASS (TYPE_MODE (vectype)) != MODE_VECTOR_INT)
5090 : return false;
5091 :
5092 4822 : unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
5093 4822 : auto_vec<tree, 16> main_loop_results (num_phis);
5094 4822 : auto_vec<tree, 16> initial_values (num_phis);
5095 4822 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5096 : {
5097 : /* The epilogue loop can be entered either from the main loop or
5098 : from an earlier guard block. */
5099 4599 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5100 18420 : for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
5101 : {
5102 : /* Look for:
5103 :
5104 : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5105 : INITIAL_VALUE(guard block)>. */
5106 4623 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5107 :
5108 4623 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5109 4623 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5110 :
5111 4623 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5112 4623 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5113 :
5114 4623 : main_loop_results.quick_push (from_main_loop);
5115 4623 : initial_values.quick_push (from_skip);
5116 : }
5117 : }
5118 : else
5119 : /* The main loop dominates the epilogue loop. */
5120 223 : main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));
5121 :
5122 : /* See if the main loop has the kind of accumulator we need. */
5123 4822 : vect_reusable_accumulator *accumulator
5124 4822 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5125 4822 : if (!accumulator
5126 9628 : || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
5127 14446 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5128 : VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
5129 : return false;
5130 :
5131 : /* Handle the case where we can reduce wider vectors to narrower ones. */
5132 4812 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5133 4812 : unsigned HOST_WIDE_INT m;
5134 4812 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5135 4812 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5136 0 : return false;
5137 : /* Check the intermediate vector types and operations are available. */
5138 4812 : tree prev_vectype = old_vectype;
5139 4812 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5140 13913 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5141 : {
5142 4811 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5143 4811 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5144 4811 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5145 4811 : if (!intermediate_vectype
5146 4811 : || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
5147 : intermediate_vectype)
5148 9102 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5149 4291 : TYPE_MODE (intermediate_vectype)))
5150 : return false;
5151 : prev_vectype = intermediate_vectype;
5152 : }
5153 :
5154 : /* Non-SLP reductions might apply an adjustment after the reduction
5155 : operation, in order to simplify the initialization of the accumulator.
5156 : If the epilogue loop carries on from where the main loop left off,
5157 : it should apply the same adjustment to the final reduction result.
5158 :
5159 : If the epilogue loop can also be entered directly (rather than via
5160 : the main loop), we need to be able to handle that case in the same way,
5161 : with the same adjustment. (In principle we could add a PHI node
5162 : to select the correct adjustment, but in practice that shouldn't be
5163 : necessary.) */
5164 4290 : tree main_adjustment
5165 4290 : = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5166 4290 : if (loop_vinfo->main_loop_edge && main_adjustment)
5167 : {
5168 3631 : gcc_assert (num_phis == 1);
5169 3631 : tree initial_value = initial_values[0];
5170 : /* Check that we can use INITIAL_VALUE as the adjustment and
5171 : initialize the accumulator with a neutral value instead. */
5172 3631 : if (!operand_equal_p (initial_value, main_adjustment))
5173 : return false;
5174 3525 : initial_values[0] = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
5175 : }
5176 4184 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5177 4184 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
5178 4184 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
5179 4184 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
5180 4184 : return true;
5181 4822 : }
5182 :
5183 : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5184 : CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5185 :
5186 : static tree
5187 4228 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5188 : gimple_seq *seq)
5189 : {
5190 4228 : gcc_assert (!VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (vec_def))
5191 : || (GET_MODE_CLASS (TYPE_MODE (TREE_TYPE (vec_def)))
5192 : == MODE_VECTOR_INT));
5193 4228 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5194 4228 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5195 4228 : tree stype = TREE_TYPE (vectype);
5196 4228 : tree new_temp = vec_def;
5197 8448 : while (nunits > nunits1)
5198 : {
5199 4220 : nunits /= 2;
5200 4220 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5201 4220 : stype, nunits);
5202 4220 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5203 :
5204 : /* The target has to make sure we support lowpart/highpart
5205 : extraction, either via direct vector extract or through
5206 : an integer mode punning. */
5207 4220 : tree dst1, dst2;
5208 4220 : gimple *epilog_stmt;
5209 4220 : if (convert_optab_handler (vec_extract_optab,
5210 4220 : TYPE_MODE (TREE_TYPE (new_temp)),
5211 4220 : TYPE_MODE (vectype1))
5212 : != CODE_FOR_nothing)
5213 : {
5214 : /* Extract sub-vectors directly once vec_extract becomes
5215 : a conversion optab. */
5216 2685 : dst1 = make_ssa_name (vectype1);
5217 2685 : epilog_stmt
5218 5370 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5219 : build3 (BIT_FIELD_REF, vectype1,
5220 2685 : new_temp, TYPE_SIZE (vectype1),
5221 : bitsize_int (0)));
5222 2685 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5223 2685 : dst2 = make_ssa_name (vectype1);
5224 2685 : epilog_stmt
5225 2685 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5226 : build3 (BIT_FIELD_REF, vectype1,
5227 2685 : new_temp, TYPE_SIZE (vectype1),
5228 2685 : bitsize_int (bitsize)));
5229 2685 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5230 : }
5231 : else
5232 : {
5233 : /* Extract via punning to appropriately sized integer mode
5234 : vector. */
5235 1535 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5236 1535 : tree etype = build_vector_type (eltype, 2);
5237 3070 : gcc_assert (convert_optab_handler (vec_extract_optab,
5238 : TYPE_MODE (etype),
5239 : TYPE_MODE (eltype))
5240 : != CODE_FOR_nothing);
5241 1535 : tree tem = make_ssa_name (etype);
5242 1535 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5243 : build1 (VIEW_CONVERT_EXPR,
5244 : etype, new_temp));
5245 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5246 1535 : new_temp = tem;
5247 1535 : tem = make_ssa_name (eltype);
5248 1535 : epilog_stmt
5249 3070 : = gimple_build_assign (tem, BIT_FIELD_REF,
5250 : build3 (BIT_FIELD_REF, eltype,
5251 1535 : new_temp, TYPE_SIZE (eltype),
5252 : bitsize_int (0)));
5253 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5254 1535 : dst1 = make_ssa_name (vectype1);
5255 1535 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5256 : build1 (VIEW_CONVERT_EXPR,
5257 : vectype1, tem));
5258 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5259 1535 : tem = make_ssa_name (eltype);
5260 1535 : epilog_stmt
5261 1535 : = gimple_build_assign (tem, BIT_FIELD_REF,
5262 : build3 (BIT_FIELD_REF, eltype,
5263 1535 : new_temp, TYPE_SIZE (eltype),
5264 1535 : bitsize_int (bitsize)));
5265 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5266 1535 : dst2 = make_ssa_name (vectype1);
5267 1535 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5268 : build1 (VIEW_CONVERT_EXPR,
5269 : vectype1, tem));
5270 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5271 : }
5272 :
5273 4220 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5274 : }
5275 4228 : if (!useless_type_conversion_p (vectype, TREE_TYPE (new_temp)))
5276 : {
5277 66 : tree dst3 = make_ssa_name (vectype);
5278 66 : gimple *epilog_stmt = gimple_build_assign (dst3, VIEW_CONVERT_EXPR,
5279 : build1 (VIEW_CONVERT_EXPR,
5280 : vectype, new_temp));
5281 66 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5282 66 : new_temp = dst3;
5283 : }
5284 :
5285 4228 : return new_temp;
5286 : }
5287 :
5288 : /* Function vect_create_epilog_for_reduction
5289 :
5290 : Create code at the loop-epilog to finalize the result of a reduction
5291 : computation.
5292 :
5293 : STMT_INFO is the scalar reduction stmt that is being vectorized.
5294 : SLP_NODE is an SLP node containing a group of reduction statements. The
5295 : first one in this group is STMT_INFO.
5296 : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5297 : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5298 : (counting from 0)
5299 : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5300 : exit this edge is always the main loop exit.
5301 :
5302 : This function:
5303 : 1. Completes the reduction def-use cycles.
5304 : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5305 : by calling the function specified by REDUC_FN if available, or by
5306 : other means (whole-vector shifts or a scalar loop).
5307 : The function also creates a new phi node at the loop exit to preserve
5308 : loop-closed form, as illustrated below.
5309 :
5310 : The flow at the entry to this function:
5311 :
5312 : loop:
5313 : vec_def = phi <vec_init, null> # REDUCTION_PHI
5314 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5315 : s_loop = scalar_stmt # (scalar) STMT_INFO
5316 : loop_exit:
5317 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5318 : use <s_out0>
5319 : use <s_out0>
5320 :
5321 : The above is transformed by this function into:
5322 :
5323 : loop:
5324 : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5325 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5326 : s_loop = scalar_stmt # (scalar) STMT_INFO
5327 : loop_exit:
5328 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5329 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5330 : v_out2 = reduce <v_out1>
5331 : s_out3 = extract_field <v_out2, 0>
5332 : s_out4 = adjust_result <s_out3>
5333 : use <s_out4>
5334 : use <s_out4>
5335 : */
5336 :
5337 : static void
5338 22202 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5339 : stmt_vec_info stmt_info,
5340 : slp_tree slp_node,
5341 : slp_instance slp_node_instance,
5342 : edge loop_exit)
5343 : {
5344 22202 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
5345 22202 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5346 22202 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
5347 22202 : tree vectype;
5348 22202 : machine_mode mode;
5349 22202 : basic_block exit_bb;
5350 22202 : gimple *new_phi = NULL, *phi = NULL;
5351 22202 : gimple_stmt_iterator exit_gsi;
5352 22202 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5353 22202 : gimple *epilog_stmt = NULL;
5354 22202 : gimple *exit_phi;
5355 22202 : tree def;
5356 22202 : tree orig_name, scalar_result;
5357 22202 : imm_use_iterator imm_iter;
5358 22202 : use_operand_p use_p;
5359 22202 : gimple *use_stmt;
5360 22202 : auto_vec<tree> reduc_inputs;
5361 22202 : int j, i;
5362 22202 : vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
5363 22202 : unsigned int k;
5364 : /* SLP reduction without reduction chain, e.g.,
5365 : # a1 = phi <a2, a0>
5366 : # b1 = phi <b2, b0>
5367 : a2 = operation (a1)
5368 : b2 = operation (b1) */
5369 22202 : const bool slp_reduc = !reduc_info->is_reduc_chain;
5370 22202 : tree induction_index = NULL_TREE;
5371 :
5372 22202 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5373 :
5374 22202 : bool double_reduc = false;
5375 22202 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5376 22202 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5377 : {
5378 0 : double_reduc = true;
5379 0 : gcc_assert (slp_reduc);
5380 : }
5381 :
5382 22202 : vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
5383 22202 : gcc_assert (vectype);
5384 22202 : mode = TYPE_MODE (vectype);
5385 :
5386 22202 : tree induc_val = NULL_TREE;
5387 22202 : tree adjustment_def = NULL;
5388 : /* Optimize: for induction condition reduction, if we can't use zero
5389 : for induc_val, use initial_def. */
5390 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5391 62 : induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
5392 22140 : else if (double_reduc)
5393 : ;
5394 : else
5395 22140 : adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
5396 :
5397 22202 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5398 22202 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5399 22202 : if (slp_reduc)
5400 : /* All statements produce live-out values. */
5401 43996 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5402 :
5403 22202 : unsigned vec_num
5404 22202 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5405 :
5406 : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5407 : which is updated with the current index of the loop for every match of
5408 : the original loop's cond_expr (VEC_STMT). This results in a vector
5409 : containing the last time the condition passed for that vector lane.
5410 : The first match will be a 1 to allow 0 to be used for non-matching
5411 : indexes. If there are no matches at all then the vector will be all
5412 : zeroes.
5413 :
5414 : PR92772: This algorithm is broken for architectures that support
5415 : masked vectors, but do not provide fold_extract_last. */
5416 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
5417 : {
5418 67 : gcc_assert (!double_reduc);
5419 67 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5420 67 : slp_tree cond_node = slp_node_instance->root;
5421 143 : while (cond_node != slp_node_instance->reduc_phis)
5422 : {
5423 76 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5424 76 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5425 : {
5426 76 : gimple *vec_stmt
5427 76 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5428 76 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5429 76 : ccompares.safe_push
5430 76 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5431 76 : SLP_TREE_REDUC_IDX (cond_node) == 2));
5432 : }
5433 76 : int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
5434 76 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5435 : }
5436 67 : gcc_assert (ccompares.length () != 0);
5437 :
5438 67 : tree indx_before_incr, indx_after_incr;
5439 67 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5440 67 : int scalar_precision
5441 67 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5442 67 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5443 67 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
5444 67 : (TYPE_MODE (vectype), cr_index_scalar_type,
5445 : TYPE_VECTOR_SUBPARTS (vectype));
5446 :
5447 : /* First we create a simple vector induction variable which starts
5448 : with the values {1,2,3,...} (SERIES_VECT) and increments by the
5449 : vector size (STEP). */
5450 :
5451 : /* Create a {1,2,3,...} vector. */
5452 67 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5453 :
5454 : /* Create a vector of the step value. */
5455 67 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5456 67 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5457 :
5458 : /* Create an induction variable. */
5459 67 : gimple_stmt_iterator incr_gsi;
5460 67 : bool insert_after;
5461 67 : vect_iv_increment_position (LOOP_VINFO_MAIN_EXIT (loop_vinfo),
5462 : &incr_gsi, &insert_after);
5463 67 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5464 : insert_after, &indx_before_incr, &indx_after_incr);
5465 :
5466 : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5467 : filled with zeros (VEC_ZERO). */
5468 :
5469 : /* Create a vector of 0s. */
5470 67 : tree zero = build_zero_cst (cr_index_scalar_type);
5471 67 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5472 :
5473 : /* Create a vector phi node. */
5474 67 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5475 67 : new_phi = create_phi_node (new_phi_tree, loop->header);
5476 67 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5477 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
5478 :
5479 : /* Now take the condition from the loops original cond_exprs
5480 : and produce a new cond_exprs (INDEX_COND_EXPR) which for
5481 : every match uses values from the induction variable
5482 : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5483 : (NEW_PHI_TREE).
5484 : Finally, we update the phi (NEW_PHI_TREE) to take the value of
5485 : the new cond_expr (INDEX_COND_EXPR). */
5486 67 : gimple_seq stmts = NULL;
5487 210 : for (int i = ccompares.length () - 1; i != -1; --i)
5488 : {
5489 76 : tree ccompare = ccompares[i].first;
5490 76 : if (ccompares[i].second)
5491 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5492 : cr_index_vector_type,
5493 : ccompare,
5494 : indx_before_incr, new_phi_tree);
5495 : else
5496 7 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5497 : cr_index_vector_type,
5498 : ccompare,
5499 : new_phi_tree, indx_before_incr);
5500 : }
5501 67 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5502 :
5503 : /* Update the phi with the vec cond. */
5504 67 : induction_index = new_phi_tree;
5505 67 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5506 : loop_latch_edge (loop), UNKNOWN_LOCATION);
5507 67 : }
5508 :
5509 : /* 2. Create epilog code.
5510 : The reduction epilog code operates across the elements of the vector
5511 : of partial results computed by the vectorized loop.
5512 : The reduction epilog code consists of:
5513 :
5514 : step 1: compute the scalar result in a vector (v_out2)
5515 : step 2: extract the scalar result (s_out3) from the vector (v_out2)
5516 : step 3: adjust the scalar result (s_out3) if needed.
5517 :
5518 : Step 1 can be accomplished using one the following three schemes:
5519 : (scheme 1) using reduc_fn, if available.
5520 : (scheme 2) using whole-vector shifts, if available.
5521 : (scheme 3) using a scalar loop. In this case steps 1+2 above are
5522 : combined.
5523 :
5524 : The overall epilog code looks like this:
5525 :
5526 : s_out0 = phi <s_loop> # original EXIT_PHI
5527 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5528 : v_out2 = reduce <v_out1> # step 1
5529 : s_out3 = extract_field <v_out2, 0> # step 2
5530 : s_out4 = adjust_result <s_out3> # step 3
5531 :
5532 : (step 3 is optional, and steps 1 and 2 may be combined).
5533 : Lastly, the uses of s_out0 are replaced by s_out4. */
5534 :
5535 :
5536 : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5537 : v_out1 = phi <VECT_DEF>
5538 : Store them in NEW_PHIS. */
5539 : /* We need to reduce values in all exits. */
5540 22202 : exit_bb = loop_exit->dest;
5541 22202 : exit_gsi = gsi_after_labels (exit_bb);
5542 22202 : reduc_inputs.create (vec_num);
5543 45876 : for (unsigned i = 0; i < vec_num; i++)
5544 : {
5545 23674 : gimple_seq stmts = NULL;
5546 23674 : def = vect_get_slp_vect_def (slp_node, i);
5547 23674 : tree new_def = copy_ssa_name (def);
5548 23674 : phi = create_phi_node (new_def, exit_bb);
5549 23674 : if (LOOP_VINFO_MAIN_EXIT (loop_vinfo) == loop_exit)
5550 23647 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
5551 : else
5552 : {
5553 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
5554 30 : SET_PHI_ARG_DEF (phi, k, def);
5555 : }
5556 23674 : new_def = gimple_convert (&stmts, vectype, new_def);
5557 23674 : reduc_inputs.quick_push (new_def);
5558 23674 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5559 : }
5560 :
5561 : /* 2.2 Get the original scalar reduction variable as defined in the loop.
5562 : In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
5563 : pattern), the scalar-def is taken from the original stmt that the
5564 : pattern-stmt (STMT) replaces. */
5565 :
5566 23019 : tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5567 22202 : tree scalar_type = TREE_TYPE (scalar_dest);
5568 22202 : scalar_results.truncate (0);
5569 22202 : scalar_results.reserve_exact (group_size);
5570 22202 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5571 :
5572 : /* True if we should implement SLP_REDUC using native reduction operations
5573 : instead of scalar operations. */
5574 22202 : const bool direct_slp_reduc
5575 22202 : = (reduc_fn != IFN_LAST
5576 22202 : && slp_reduc
5577 22202 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5578 :
5579 : /* If signed overflow is undefined we might need to perform reduction
5580 : computations in an unsigned type. */
5581 22202 : tree compute_vectype = vectype;
5582 22202 : if (ANY_INTEGRAL_TYPE_P (vectype)
5583 15202 : && TYPE_OVERFLOW_UNDEFINED (vectype)
5584 5549 : && code.is_tree_code ()
5585 27751 : && arith_code_with_undefined_signed_overflow ((tree_code) code))
5586 4086 : compute_vectype = unsigned_type_for (vectype);
5587 :
5588 : /* In case of reduction chain, e.g.,
5589 : # a1 = phi <a3, a0>
5590 : a2 = operation (a1)
5591 : a3 = operation (a2),
5592 :
5593 : we may end up with more than one vector result. Here we reduce them
5594 : to one vector.
5595 :
5596 : The same is true for a SLP reduction, e.g.,
5597 : # a1 = phi <a2, a0>
5598 : # b1 = phi <b2, b0>
5599 : a2 = operation (a1)
5600 : b2 = operation (a2),
5601 :
5602 : where we can end up with more than one vector as well. We can
5603 : easily accumulate vectors when the number of vector elements is
5604 : a multiple of the SLP group size.
5605 :
5606 : The same is true if we couldn't use a single defuse cycle. */
5607 22202 : if ((!slp_reduc
5608 : || direct_slp_reduc
5609 : || (slp_reduc
5610 22202 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
5611 44404 : && reduc_inputs.length () > 1)
5612 : {
5613 542 : gimple_seq stmts = NULL;
5614 542 : tree single_input = reduc_inputs[0];
5615 542 : if (compute_vectype != vectype)
5616 157 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5617 : compute_vectype, single_input);
5618 1849 : for (k = 1; k < reduc_inputs.length (); k++)
5619 : {
5620 1307 : tree input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5621 1307 : compute_vectype, reduc_inputs[k]);
5622 1307 : single_input = gimple_build (&stmts, code, compute_vectype,
5623 : single_input, input);
5624 : }
5625 542 : if (compute_vectype != vectype)
5626 157 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5627 : vectype, single_input);
5628 542 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5629 :
5630 542 : reduc_inputs.truncate (0);
5631 542 : reduc_inputs.safe_push (single_input);
5632 : }
5633 :
5634 22202 : tree orig_reduc_input = reduc_inputs[0];
5635 :
5636 : /* If this loop is an epilogue loop that can be skipped after the
5637 : main loop, we can only share a reduction operation between the
5638 : main loop and the epilogue if we put it at the target of the
5639 : skip edge.
5640 :
5641 : We can still reuse accumulators if this check fails. Doing so has
5642 : the minor(?) benefit of making the epilogue loop's scalar result
5643 : independent of the main loop's scalar result. */
5644 22202 : bool unify_with_main_loop_p = false;
5645 22202 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
5646 4184 : && loop_vinfo->skip_this_loop_edge
5647 3944 : && single_succ_p (exit_bb)
5648 22223 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5649 : {
5650 21 : unify_with_main_loop_p = true;
5651 :
5652 21 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5653 21 : reduc_inputs[0] = make_ssa_name (vectype);
5654 21 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5655 21 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5656 : UNKNOWN_LOCATION);
5657 21 : add_phi_arg (new_phi,
5658 21 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
5659 : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5660 21 : exit_gsi = gsi_after_labels (reduc_block);
5661 : }
5662 :
5663 : /* Shouldn't be used beyond this point. */
5664 22202 : exit_bb = nullptr;
5665 :
5666 : /* If we are operating on a mask vector and do not support direct mask
5667 : reduction, work on a bool data vector instead of a mask vector. */
5668 22202 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5669 227 : && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
5670 22394 : && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
5671 : {
5672 192 : compute_vectype = vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
5673 192 : gimple_seq stmts = NULL;
5674 392 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5675 400 : reduc_inputs[i] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
5676 200 : reduc_inputs[i],
5677 : build_one_cst (vectype),
5678 : build_zero_cst (vectype));
5679 192 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5680 : }
5681 :
5682 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5683 67 : && reduc_fn != IFN_LAST)
5684 : {
5685 : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5686 : various data values where the condition matched and another vector
5687 : (INDUCTION_INDEX) containing all the indexes of those matches. We
5688 : need to extract the last matching index (which will be the index with
5689 : highest value) and use this to index into the data vector.
5690 : For the case where there were no matches, the data vector will contain
5691 : all default values and the index vector will be all zeros. */
5692 :
5693 : /* Get various versions of the type of the vector of indexes. */
5694 4 : tree index_vec_type = TREE_TYPE (induction_index);
5695 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5696 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
5697 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
5698 :
5699 : /* Get an unsigned integer version of the type of the data vector. */
5700 4 : int scalar_precision
5701 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5702 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5703 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5704 : vectype);
5705 :
5706 : /* First we need to create a vector (ZERO_VEC) of zeros and another
5707 : vector (MAX_INDEX_VEC) filled with the last matching index, which we
5708 : can create using a MAX reduction and then expanding.
5709 : In the case where the loop never made any matches, the max index will
5710 : be zero. */
5711 :
5712 : /* Vector of {0, 0, 0,...}. */
5713 4 : tree zero_vec = build_zero_cst (vectype);
5714 :
5715 : /* Find maximum value from the vector of found indexes. */
5716 4 : tree max_index = make_ssa_name (index_scalar_type);
5717 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5718 : 1, induction_index);
5719 4 : gimple_call_set_lhs (max_index_stmt, max_index);
5720 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5721 :
5722 : /* Vector of {max_index, max_index, max_index,...}. */
5723 4 : tree max_index_vec = make_ssa_name (index_vec_type);
5724 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5725 : max_index);
5726 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5727 : max_index_vec_rhs);
5728 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5729 :
5730 : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5731 : with the vector (INDUCTION_INDEX) of found indexes, choosing values
5732 : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5733 : otherwise. Only one value should match, resulting in a vector
5734 : (VEC_COND) with one data value and the rest zeros.
5735 : In the case where the loop never made any matches, every index will
5736 : match, resulting in a vector with all data values (which will all be
5737 : the default value). */
5738 :
5739 : /* Compare the max index vector to the vector of found indexes to find
5740 : the position of the max value. */
5741 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
5742 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5743 : induction_index,
5744 : max_index_vec);
5745 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5746 :
5747 : /* Use the compare to choose either values from the data vector or
5748 : zero. */
5749 4 : tree vec_cond = make_ssa_name (vectype);
5750 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5751 : vec_compare,
5752 4 : reduc_inputs[0],
5753 : zero_vec);
5754 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5755 :
5756 : /* Finally we need to extract the data value from the vector (VEC_COND)
5757 : into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5758 : reduction, but because this doesn't exist, we can use a MAX reduction
5759 : instead. The data value might be signed or a float so we need to cast
5760 : it first.
5761 : In the case where the loop never made any matches, the data values are
5762 : all identical, and so will reduce down correctly. */
5763 :
5764 : /* Make the matched data values unsigned. */
5765 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5766 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5767 : vec_cond);
5768 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5769 : VIEW_CONVERT_EXPR,
5770 : vec_cond_cast_rhs);
5771 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5772 :
5773 : /* Reduce down to a scalar value. */
5774 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
5775 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5776 : 1, vec_cond_cast);
5777 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5778 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5779 :
5780 : /* Convert the reduced value back to the result type and set as the
5781 : result. */
5782 4 : gimple_seq stmts = NULL;
5783 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5784 : data_reduc);
5785 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5786 4 : scalar_results.safe_push (new_temp);
5787 4 : }
5788 22198 : else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5789 63 : && reduc_fn == IFN_LAST)
5790 : {
5791 : /* Condition reduction without supported IFN_REDUC_MAX. Generate
5792 : idx = 0;
5793 : idx_val = induction_index[0];
5794 : val = data_reduc[0];
5795 : for (idx = 0, val = init, i = 0; i < nelts; ++i)
5796 : if (induction_index[i] > idx_val)
5797 : val = data_reduc[i], idx_val = induction_index[i];
5798 : return val; */
5799 :
5800 63 : tree data_eltype = TREE_TYPE (vectype);
5801 63 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5802 63 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5803 63 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5804 : /* Enforced by vectorizable_reduction, which ensures we have target
5805 : support before allowing a conditional reduction on variable-length
5806 : vectors. */
5807 63 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5808 63 : tree idx_val = NULL_TREE, val = NULL_TREE;
5809 419 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5810 : {
5811 356 : tree old_idx_val = idx_val;
5812 356 : tree old_val = val;
5813 356 : idx_val = make_ssa_name (idx_eltype);
5814 356 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5815 : build3 (BIT_FIELD_REF, idx_eltype,
5816 : induction_index,
5817 356 : bitsize_int (el_size),
5818 356 : bitsize_int (off)));
5819 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5820 356 : val = make_ssa_name (data_eltype);
5821 712 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5822 : build3 (BIT_FIELD_REF,
5823 : data_eltype,
5824 356 : reduc_inputs[0],
5825 356 : bitsize_int (el_size),
5826 356 : bitsize_int (off)));
5827 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5828 356 : if (off != 0)
5829 : {
5830 293 : tree new_idx_val = idx_val;
5831 293 : if (off != v_size - el_size)
5832 : {
5833 230 : new_idx_val = make_ssa_name (idx_eltype);
5834 230 : epilog_stmt = gimple_build_assign (new_idx_val,
5835 : MAX_EXPR, idx_val,
5836 : old_idx_val);
5837 230 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5838 : }
5839 293 : tree cond = make_ssa_name (boolean_type_node);
5840 293 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5841 : idx_val, old_idx_val);
5842 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5843 293 : tree new_val = make_ssa_name (data_eltype);
5844 293 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5845 : cond, val, old_val);
5846 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5847 293 : idx_val = new_idx_val;
5848 293 : val = new_val;
5849 : }
5850 : }
5851 : /* Convert the reduced value back to the result type and set as the
5852 : result. */
5853 63 : gimple_seq stmts = NULL;
5854 63 : val = gimple_convert (&stmts, scalar_type, val);
5855 63 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5856 63 : scalar_results.safe_push (val);
5857 63 : }
5858 :
5859 : /* 2.3 Create the reduction code, using one of the three schemes described
5860 : above. In SLP we simply need to extract all the elements from the
5861 : vector (without reducing them), so we use scalar shifts. */
5862 22135 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
5863 : {
5864 20265 : tree tmp;
5865 20265 : tree vec_elem_type;
5866 :
5867 : /* Case 1: Create:
5868 : v_out2 = reduc_expr <v_out1> */
5869 :
5870 20265 : if (dump_enabled_p ())
5871 1514 : dump_printf_loc (MSG_NOTE, vect_location,
5872 : "Reduce using direct vector reduction.\n");
5873 :
5874 20265 : gimple_seq stmts = NULL;
5875 20265 : vec_elem_type = TREE_TYPE (vectype);
5876 20265 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5877 20265 : vec_elem_type, reduc_inputs[0]);
5878 20265 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5879 20265 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5880 :
5881 20265 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5882 62 : && induc_val)
5883 : {
5884 : /* Earlier we set the initial value to be a vector if induc_val
5885 : values. Check the result and if it is induc_val then replace
5886 : with the original initial value, unless induc_val is
5887 : the same as initial_def already. */
5888 60 : tree zcompare = make_ssa_name (boolean_type_node);
5889 60 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5890 : new_temp, induc_val);
5891 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5892 60 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
5893 60 : tmp = make_ssa_name (new_scalar_dest);
5894 60 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5895 : initial_def, new_temp);
5896 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5897 60 : new_temp = tmp;
5898 : }
5899 :
5900 20265 : scalar_results.safe_push (new_temp);
5901 20265 : }
5902 1679 : else if (direct_slp_reduc)
5903 : {
5904 : /* Here we create one vector for each of the GROUP_SIZE results,
5905 : with the elements for other SLP statements replaced with the
5906 : neutral value. We can then do a normal reduction on each vector. */
5907 :
5908 : /* Enforced by vectorizable_reduction. */
5909 : gcc_assert (reduc_inputs.length () == 1);
5910 : gcc_assert (pow2p_hwi (group_size));
5911 :
5912 : gimple_seq seq = NULL;
5913 :
5914 : /* Build a vector {0, 1, 2, ...}, with the same number of elements
5915 : and the same element size as VECTYPE. */
5916 : tree index = build_index_vector (vectype, 0, 1);
5917 : tree index_type = TREE_TYPE (index);
5918 : tree index_elt_type = TREE_TYPE (index_type);
5919 : tree mask_type = truth_type_for (index_type);
5920 :
5921 : /* Create a vector that, for each element, identifies which of
5922 : the results should use it. */
5923 : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5924 : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5925 : build_vector_from_val (index_type, index_mask));
5926 :
5927 : /* Get a neutral vector value. This is simply a splat of the neutral
5928 : scalar value if we have one, otherwise the initial scalar value
5929 : is itself a neutral value. */
5930 : tree vector_identity = NULL_TREE;
5931 : tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5932 : NULL_TREE, false);
5933 : if (neutral_op)
5934 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5935 : neutral_op);
5936 : for (unsigned int i = 0; i < group_size; ++i)
5937 : {
5938 : /* If there's no univeral neutral value, we can use the
5939 : initial scalar value from the original PHI. This is used
5940 : for MIN and MAX reduction, for example. */
5941 : if (!neutral_op)
5942 : {
5943 : tree scalar_value
5944 : = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
5945 : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5946 : scalar_value);
5947 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5948 : scalar_value);
5949 : }
5950 :
5951 : /* Calculate the equivalent of:
5952 :
5953 : sel[j] = (index[j] == i);
5954 :
5955 : which selects the elements of REDUC_INPUTS[0] that should
5956 : be included in the result. */
5957 : tree compare_val = build_int_cst (index_elt_type, i);
5958 : compare_val = build_vector_from_val (index_type, compare_val);
5959 : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5960 : index, compare_val);
5961 :
5962 : /* Calculate the equivalent of:
5963 :
5964 : vec = seq ? reduc_inputs[0] : vector_identity;
5965 :
5966 : VEC is now suitable for a full vector reduction. */
5967 : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5968 : sel, reduc_inputs[0], vector_identity);
5969 :
5970 : /* Do the reduction and convert it to the appropriate type. */
5971 : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5972 : TREE_TYPE (vectype), vec);
5973 : scalar = gimple_convert (&seq, scalar_type, scalar);
5974 : scalar_results.safe_push (scalar);
5975 : }
5976 : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5977 : }
5978 : else
5979 : {
5980 1679 : bool reduce_with_shift;
5981 1679 : tree vec_temp;
5982 :
5983 1679 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5984 :
5985 : /* See if the target wants to do the final (shift) reduction
5986 : in a vector mode of smaller size and first reduce upper/lower
5987 : halves against each other. */
5988 1870 : enum machine_mode mode1 = mode;
5989 1870 : tree stype = TREE_TYPE (vectype);
5990 1870 : if (compute_vectype != vectype)
5991 : {
5992 482 : stype = unsigned_type_for (stype);
5993 482 : gimple_seq stmts = NULL;
5994 1034 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5995 : {
5996 552 : tree new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5997 552 : compute_vectype, reduc_inputs[i]);
5998 552 : reduc_inputs[i] = new_temp;
5999 : }
6000 482 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6001 : }
6002 1870 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6003 1870 : unsigned nunits1 = nunits;
6004 1870 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6005 1870 : && reduc_inputs.length () == 1)
6006 : {
6007 41 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6008 : /* For SLP reductions we have to make sure lanes match up, but
6009 : since we're doing individual element final reduction reducing
6010 : vector width here is even more important.
6011 : ??? We can also separate lanes with permutes, for the common
6012 : case of power-of-two group-size odd/even extracts would work. */
6013 41 : if (slp_reduc && nunits != nunits1)
6014 : {
6015 41 : nunits1 = least_common_multiple (nunits1, group_size);
6016 82 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6017 : }
6018 : }
6019 1829 : else if (!slp_reduc
6020 1829 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6021 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6022 :
6023 1870 : tree vectype1 = compute_vectype;
6024 1870 : if (mode1 != mode)
6025 : {
6026 47 : vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6027 47 : stype, nunits1);
6028 : /* First reduce the vector to the desired vector size we should
6029 : do shift reduction on by combining upper and lower halves. */
6030 47 : gimple_seq stmts = NULL;
6031 47 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6032 : code, &stmts);
6033 47 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6034 47 : reduc_inputs[0] = new_temp;
6035 : }
6036 :
6037 1870 : reduce_with_shift = have_whole_vector_shift (mode1);
6038 729 : if (!VECTOR_MODE_P (mode1)
6039 2597 : || !directly_supported_p (code, vectype1))
6040 : reduce_with_shift = false;
6041 :
6042 1853 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
6043 : {
6044 1629 : int element_bitsize = vector_element_bits (vectype1);
6045 : /* Enforced by vectorizable_reduction, which disallows SLP reductions
6046 : for variable-length vectors and also requires direct target support
6047 : for loop reductions. */
6048 1629 : int nelements = TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6049 1629 : vec_perm_builder sel;
6050 1629 : vec_perm_indices indices;
6051 :
6052 1629 : int elt_offset;
6053 :
6054 1629 : tree zero_vec = build_zero_cst (vectype1);
6055 : /* Case 2: Create:
6056 : for (offset = nelements/2; offset >= 1; offset/=2)
6057 : {
6058 : Create: va' = vec_shift <va, offset>
6059 : Create: va = vop <va, va'>
6060 : } */
6061 :
6062 1629 : if (dump_enabled_p ())
6063 365 : dump_printf_loc (MSG_NOTE, vect_location,
6064 : "Reduce using vector shifts\n");
6065 :
6066 1629 : gimple_seq stmts = NULL;
6067 1629 : new_temp = gimple_convert (&stmts, vectype1, reduc_inputs[0]);
6068 1629 : for (elt_offset = nelements / 2;
6069 3568 : elt_offset >= 1;
6070 1939 : elt_offset /= 2)
6071 : {
6072 1939 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6073 1939 : indices.new_vector (sel, 2, nelements);
6074 1939 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6075 1939 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6076 : new_temp, zero_vec, mask);
6077 1939 : new_temp = gimple_build (&stmts, code,
6078 : vectype1, new_name, new_temp);
6079 : }
6080 :
6081 : /* 2.4 Extract the final scalar result. Create:
6082 : s_out3 = extract_field <v_out2, bitpos> */
6083 :
6084 1629 : if (dump_enabled_p ())
6085 365 : dump_printf_loc (MSG_NOTE, vect_location,
6086 : "extract scalar result\n");
6087 :
6088 1629 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
6089 1629 : new_temp, bitsize_int (element_bitsize),
6090 1629 : bitsize_zero_node);
6091 1629 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6092 1629 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6093 1629 : scalar_results.safe_push (new_temp);
6094 1629 : }
6095 : else
6096 : {
6097 : /* Case 3: Create:
6098 : s = extract_field <v_out2, 0>
6099 : for (offset = element_size;
6100 : offset < vector_size;
6101 : offset += element_size;)
6102 : {
6103 : Create: s' = extract_field <v_out2, offset>
6104 : Create: s = op <s, s'> // For non SLP cases
6105 : } */
6106 :
6107 241 : if (dump_enabled_p ())
6108 150 : dump_printf_loc (MSG_NOTE, vect_location,
6109 : "Reduce using scalar code.\n");
6110 :
6111 241 : tree compute_type = TREE_TYPE (vectype1);
6112 241 : unsigned element_bitsize = vector_element_bits (vectype1);
6113 241 : unsigned vec_size_in_bits = element_bitsize
6114 241 : * TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6115 241 : tree bitsize = bitsize_int (element_bitsize);
6116 241 : gimple_seq stmts = NULL;
6117 647 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6118 : {
6119 406 : unsigned bit_offset;
6120 812 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6121 406 : vec_temp, bitsize, bitsize_zero_node);
6122 :
6123 : /* In SLP we don't need to apply reduction operation, so we just
6124 : collect s' values in SCALAR_RESULTS. */
6125 406 : if (slp_reduc)
6126 396 : scalar_results.safe_push (new_temp);
6127 :
6128 1000 : for (bit_offset = element_bitsize;
6129 1406 : bit_offset < vec_size_in_bits;
6130 1000 : bit_offset += element_bitsize)
6131 : {
6132 1000 : tree bitpos = bitsize_int (bit_offset);
6133 1000 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6134 : compute_type, vec_temp,
6135 : bitsize, bitpos);
6136 1000 : if (slp_reduc)
6137 : {
6138 : /* In SLP we don't need to apply reduction operation, so
6139 : we just collect s' values in SCALAR_RESULTS. */
6140 990 : new_temp = new_name;
6141 990 : scalar_results.safe_push (new_name);
6142 : }
6143 : else
6144 10 : new_temp = gimple_build (&stmts, code, compute_type,
6145 : new_name, new_temp);
6146 : }
6147 : }
6148 :
6149 : /* The only case where we need to reduce scalar results in a SLP
6150 : reduction, is unrolling. If the size of SCALAR_RESULTS is
6151 : greater than GROUP_SIZE, we reduce them combining elements modulo
6152 : GROUP_SIZE. */
6153 241 : if (slp_reduc)
6154 : {
6155 231 : tree res, first_res, new_res;
6156 :
6157 : /* Reduce multiple scalar results in case of SLP unrolling. */
6158 925 : for (j = group_size; scalar_results.iterate (j, &res);
6159 : j++)
6160 : {
6161 694 : first_res = scalar_results[j % group_size];
6162 694 : new_res = gimple_build (&stmts, code, compute_type,
6163 : first_res, res);
6164 694 : scalar_results[j % group_size] = new_res;
6165 : }
6166 231 : scalar_results.truncate (group_size);
6167 1154 : for (k = 0; k < group_size; k++)
6168 1384 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6169 692 : scalar_results[k]);
6170 : }
6171 : else
6172 : {
6173 : /* Reduction chain - we have one scalar to keep in
6174 : SCALAR_RESULTS. */
6175 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6176 10 : scalar_results.safe_push (new_temp);
6177 : }
6178 :
6179 241 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6180 : }
6181 :
6182 1870 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6183 0 : && induc_val)
6184 : {
6185 : /* Earlier we set the initial value to be a vector if induc_val
6186 : values. Check the result and if it is induc_val then replace
6187 : with the original initial value, unless induc_val is
6188 : the same as initial_def already. */
6189 0 : tree zcompare = make_ssa_name (boolean_type_node);
6190 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6191 0 : scalar_results[0], induc_val);
6192 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6193 0 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
6194 0 : tree tmp = make_ssa_name (new_scalar_dest);
6195 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6196 0 : initial_def, scalar_results[0]);
6197 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6198 0 : scalar_results[0] = tmp;
6199 : }
6200 : }
6201 :
6202 : /* 2.5 Adjust the final result by the initial value of the reduction
6203 : variable. (When such adjustment is not needed, then
6204 : 'adjustment_def' is zero). For example, if code is PLUS we create:
6205 : new_temp = loop_exit_def + adjustment_def */
6206 :
6207 22202 : if (adjustment_def)
6208 : {
6209 15908 : gcc_assert (!slp_reduc || group_size == 1);
6210 15908 : gimple_seq stmts = NULL;
6211 15908 : if (double_reduc)
6212 : {
6213 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6214 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6215 0 : new_temp = gimple_build (&stmts, code, vectype,
6216 0 : reduc_inputs[0], adjustment_def);
6217 : }
6218 : else
6219 : {
6220 15908 : new_temp = scalar_results[0];
6221 15908 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6222 15908 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6223 : adjustment_def);
6224 15908 : new_temp = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6225 : new_temp);
6226 15908 : new_temp = gimple_build (&stmts, code, TREE_TYPE (compute_vectype),
6227 : new_temp, adjustment_def);
6228 15908 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6229 : }
6230 :
6231 15908 : epilog_stmt = gimple_seq_last_stmt (stmts);
6232 15908 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6233 15908 : scalar_results[0] = new_temp;
6234 : }
6235 :
6236 : /* Record this operation if it could be reused by the epilogue loop. */
6237 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
6238 22202 : && reduc_inputs.length () == 1)
6239 22016 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6240 : { orig_reduc_input, reduc_info });
6241 :
6242 : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6243 : phis with new adjusted scalar results, i.e., replace use <s_out0>
6244 : with use <s_out4>.
6245 :
6246 : Transform:
6247 : loop_exit:
6248 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6249 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6250 : v_out2 = reduce <v_out1>
6251 : s_out3 = extract_field <v_out2, 0>
6252 : s_out4 = adjust_result <s_out3>
6253 : use <s_out0>
6254 : use <s_out0>
6255 :
6256 : into:
6257 :
6258 : loop_exit:
6259 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6260 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6261 : v_out2 = reduce <v_out1>
6262 : s_out3 = extract_field <v_out2, 0>
6263 : s_out4 = adjust_result <s_out3>
6264 : use <s_out4>
6265 : use <s_out4> */
6266 :
6267 44404 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6268 22202 : auto_vec<gimple *> phis;
6269 44865 : for (k = 0; k < live_out_stmts.size (); k++)
6270 : {
6271 22663 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6272 22663 : tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6273 :
6274 : /* Find the loop-closed-use at the loop exit of the original scalar
6275 : result. (The reduction result is expected to have two immediate uses,
6276 : one at the latch block, and one at the loop exit). Note with
6277 : early break we can have two exit blocks, so pick the correct PHI. */
6278 115113 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6279 69787 : if (!is_gimple_debug (USE_STMT (use_p))
6280 69787 : && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6281 : {
6282 22658 : gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
6283 22658 : if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6284 22650 : phis.safe_push (USE_STMT (use_p));
6285 22663 : }
6286 :
6287 45313 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6288 : {
6289 : /* Replace the uses: */
6290 22650 : orig_name = PHI_RESULT (exit_phi);
6291 :
6292 : /* Look for a single use at the target of the skip edge. */
6293 22650 : if (unify_with_main_loop_p)
6294 : {
6295 38 : use_operand_p use_p;
6296 38 : gimple *user;
6297 38 : if (!single_imm_use (orig_name, &use_p, &user))
6298 0 : gcc_unreachable ();
6299 38 : orig_name = gimple_get_lhs (user);
6300 : }
6301 :
6302 22650 : scalar_result = scalar_results[k];
6303 84028 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6304 : {
6305 38728 : gphi *use_phi = dyn_cast <gphi *> (use_stmt);
6306 116228 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6307 : {
6308 38750 : if (use_phi
6309 38750 : && (phi_arg_edge_from_use (use_p)->flags & EDGE_ABNORMAL))
6310 : {
6311 0 : gcc_assert (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (orig_name));
6312 0 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (scalar_result) = 1;
6313 : }
6314 38750 : SET_USE (use_p, scalar_result);
6315 : }
6316 38728 : update_stmt (use_stmt);
6317 22650 : }
6318 : }
6319 :
6320 22663 : phis.truncate (0);
6321 : }
6322 22202 : }
6323 :
6324 : /* Return a vector of type VECTYPE that is equal to the vector select
6325 : operation "MASK ? VEC : IDENTITY". Insert the select statements
6326 : before GSI. */
6327 :
6328 : static tree
6329 9 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6330 : tree vec, tree identity)
6331 : {
6332 9 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6333 9 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6334 : mask, vec, identity);
6335 9 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6336 9 : return cond;
6337 : }
6338 :
6339 : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6340 : order, starting with LHS. Insert the extraction statements before GSI and
6341 : associate the new scalar SSA names with variable SCALAR_DEST.
6342 : If MASK is nonzero mask the input and then operate on it unconditionally.
6343 : Return the SSA name for the result. */
6344 :
6345 : static tree
6346 1101 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6347 : tree_code code, tree lhs, tree vector_rhs,
6348 : tree mask)
6349 : {
6350 1101 : tree vectype = TREE_TYPE (vector_rhs);
6351 1101 : tree scalar_type = TREE_TYPE (vectype);
6352 1101 : tree bitsize = TYPE_SIZE (scalar_type);
6353 1101 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6354 1101 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6355 :
6356 : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6357 : to perform an unconditional element-wise reduction of it. */
6358 1101 : if (mask)
6359 : {
6360 77 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6361 : "masked_vector_rhs");
6362 77 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6363 : false);
6364 77 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6365 77 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6366 : mask, vector_rhs, vector_identity);
6367 77 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6368 77 : vector_rhs = masked_vector_rhs;
6369 : }
6370 :
6371 1101 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6372 5141 : bit_offset < vec_size_in_bits;
6373 4040 : bit_offset += element_bitsize)
6374 : {
6375 4040 : tree bitpos = bitsize_int (bit_offset);
6376 4040 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6377 : bitsize, bitpos);
6378 :
6379 4040 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6380 4040 : rhs = make_ssa_name (scalar_dest, stmt);
6381 4040 : gimple_assign_set_lhs (stmt, rhs);
6382 4040 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6383 : /* Fold the vector extract, combining it with a previous reversal
6384 : like seen in PR90579. */
6385 4040 : auto gsi2 = gsi_for_stmt (stmt);
6386 4040 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6387 356 : update_stmt (gsi_stmt (gsi2));
6388 :
6389 4040 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6390 4040 : tree new_name = make_ssa_name (scalar_dest, stmt);
6391 4040 : gimple_assign_set_lhs (stmt, new_name);
6392 4040 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6393 4040 : lhs = new_name;
6394 : }
6395 1101 : return lhs;
6396 : }
6397 :
6398 : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6399 : type of the vector input. */
6400 :
6401 : static internal_fn
6402 2538 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6403 : {
6404 2538 : internal_fn mask_reduc_fn;
6405 2538 : internal_fn mask_len_reduc_fn;
6406 :
6407 2538 : switch (reduc_fn)
6408 : {
6409 0 : case IFN_FOLD_LEFT_PLUS:
6410 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6411 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6412 0 : break;
6413 :
6414 : default:
6415 : return IFN_LAST;
6416 : }
6417 :
6418 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6419 : OPTIMIZE_FOR_SPEED))
6420 : return mask_reduc_fn;
6421 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6422 : OPTIMIZE_FOR_SPEED))
6423 : return mask_len_reduc_fn;
6424 : return IFN_LAST;
6425 : }
6426 :
6427 : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6428 : statement that sets the live-out value. REDUC_DEF_STMT is the phi
6429 : statement. CODE is the operation performed by STMT_INFO and OPS are
6430 : its scalar operands. REDUC_INDEX is the index of the operand in
6431 : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6432 : implements in-order reduction, or IFN_LAST if we should open-code it.
6433 : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6434 : that should be used to control the operation in a fully-masked loop. */
6435 :
6436 : static bool
6437 843 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6438 : stmt_vec_info stmt_info,
6439 : gimple_stmt_iterator *gsi,
6440 : slp_tree slp_node,
6441 : code_helper code, internal_fn reduc_fn,
6442 : int num_ops, tree vectype_in,
6443 : int reduc_index, vec_loop_masks *masks,
6444 : vec_loop_lens *lens)
6445 : {
6446 843 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6447 843 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
6448 843 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6449 :
6450 843 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6451 :
6452 843 : bool is_cond_op = false;
6453 843 : if (!code.is_tree_code ())
6454 : {
6455 23 : code = conditional_internal_fn_code (internal_fn (code));
6456 23 : gcc_assert (code != ERROR_MARK);
6457 : is_cond_op = true;
6458 : }
6459 :
6460 843 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
6461 :
6462 843 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6463 : TYPE_VECTOR_SUBPARTS (vectype_in)));
6464 :
6465 : /* ??? We should, when transforming the cycle PHI, record the existing
6466 : scalar def as vector def so looking up the vector def works. This
6467 : would also allow generalizing this for reduction paths of length > 1
6468 : and/or SLP reductions. */
6469 843 : slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
6470 843 : stmt_vec_info reduc_var_def = SLP_TREE_SCALAR_STMTS (reduc_node)[0];
6471 843 : tree reduc_var = gimple_get_lhs (STMT_VINFO_STMT (reduc_var_def));
6472 :
6473 : /* The operands either come from a binary operation or an IFN_COND operation.
6474 : The former is a gimple assign with binary rhs and the latter is a
6475 : gimple call with four arguments. */
6476 843 : gcc_assert (num_ops == 2 || num_ops == 4);
6477 :
6478 843 : auto_vec<tree> vec_oprnds0, vec_opmask;
6479 843 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
6480 843 : + (1 - reduc_index)],
6481 : &vec_oprnds0);
6482 : /* For an IFN_COND_OP we also need the vector mask operand. */
6483 843 : if (is_cond_op)
6484 23 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
6485 :
6486 : /* The transform below relies on preserving the original scalar PHI
6487 : and its latch def which we replace. So work backwards from there. */
6488 843 : tree scalar_dest
6489 843 : = gimple_phi_arg_def_from_edge (as_a <gphi *> (STMT_VINFO_STMT
6490 : (reduc_var_def)),
6491 843 : loop_latch_edge (loop));
6492 843 : stmt_vec_info scalar_dest_def_info
6493 843 : = vect_stmt_to_vectorize (loop_vinfo->lookup_def (scalar_dest));
6494 843 : tree scalar_type = TREE_TYPE (scalar_dest);
6495 :
6496 843 : int vec_num = vec_oprnds0.length ();
6497 843 : tree vec_elem_type = TREE_TYPE (vectype_out);
6498 843 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6499 :
6500 843 : tree vector_identity = NULL_TREE;
6501 843 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6502 : {
6503 2 : vector_identity = build_zero_cst (vectype_out);
6504 2 : if (!HONOR_SIGNED_ZEROS (vectype_out))
6505 : ;
6506 : else
6507 : {
6508 2 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6509 2 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6510 : vector_identity);
6511 : }
6512 : }
6513 :
6514 843 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6515 843 : int i;
6516 843 : tree def0;
6517 1944 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6518 : {
6519 1101 : gimple *new_stmt;
6520 1101 : tree mask = NULL_TREE;
6521 1101 : tree len = NULL_TREE;
6522 1101 : tree bias = NULL_TREE;
6523 1101 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6524 : {
6525 9 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
6526 : vec_num, vectype_in, i);
6527 9 : if (is_cond_op)
6528 9 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
6529 9 : loop_mask, vec_opmask[i], gsi);
6530 : else
6531 : mask = loop_mask;
6532 : }
6533 1092 : else if (is_cond_op)
6534 68 : mask = vec_opmask[i];
6535 1101 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6536 : {
6537 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6538 : i, 1, false);
6539 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6540 0 : bias = build_int_cst (intQI_type_node, biasval);
6541 0 : if (!is_cond_op)
6542 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
6543 : }
6544 :
6545 : /* Handle MINUS by adding the negative. */
6546 1101 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6547 : {
6548 0 : tree negated = make_ssa_name (vectype_out);
6549 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6550 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6551 0 : def0 = negated;
6552 : }
6553 :
6554 9 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
6555 1110 : && mask && mask_reduc_fn == IFN_LAST)
6556 9 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6557 : vector_identity);
6558 :
6559 : /* On the first iteration the input is simply the scalar phi
6560 : result, and for subsequent iterations it is the output of
6561 : the preceding operation. */
6562 1101 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6563 : {
6564 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6565 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6566 : def0, mask, len, bias);
6567 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6568 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6569 : def0, mask);
6570 : else
6571 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6572 : def0);
6573 : /* For chained SLP reductions the output of the previous reduction
6574 : operation serves as the input of the next. For the final statement
6575 : the output cannot be a temporary - we reuse the original
6576 : scalar destination of the last statement. */
6577 0 : if (i != vec_num - 1)
6578 : {
6579 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
6580 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6581 0 : gimple_set_lhs (new_stmt, reduc_var);
6582 : }
6583 : }
6584 : else
6585 : {
6586 1101 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
6587 : tree_code (code), reduc_var, def0,
6588 : mask);
6589 1101 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6590 : /* Remove the statement, so that we can use the same code paths
6591 : as for statements that we've just created. */
6592 1101 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6593 1101 : gsi_remove (&tmp_gsi, true);
6594 : }
6595 :
6596 1101 : if (i == vec_num - 1)
6597 : {
6598 843 : gimple_set_lhs (new_stmt, scalar_dest);
6599 843 : vect_finish_replace_stmt (loop_vinfo,
6600 : scalar_dest_def_info,
6601 : new_stmt);
6602 : }
6603 : else
6604 258 : vect_finish_stmt_generation (loop_vinfo,
6605 : scalar_dest_def_info,
6606 : new_stmt, gsi);
6607 :
6608 1101 : slp_node->push_vec_def (new_stmt);
6609 : }
6610 :
6611 843 : return true;
6612 843 : }
6613 :
6614 : /* Function is_nonwrapping_integer_induction.
6615 :
6616 : Check if STMT_VINO (which is part of loop LOOP) both increments and
6617 : does not cause overflow. */
6618 :
6619 : static bool
6620 408 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6621 : {
6622 408 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6623 408 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6624 408 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6625 408 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6626 408 : widest_int ni, max_loop_value, lhs_max;
6627 408 : wi::overflow_type overflow = wi::OVF_NONE;
6628 :
6629 : /* Make sure the loop is integer based. */
6630 408 : if (TREE_CODE (base) != INTEGER_CST
6631 109 : || TREE_CODE (step) != INTEGER_CST)
6632 : return false;
6633 :
6634 : /* Check that the max size of the loop will not wrap. */
6635 :
6636 109 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6637 : return true;
6638 :
6639 8 : if (! max_stmt_executions (loop, &ni))
6640 : return false;
6641 :
6642 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6643 8 : &overflow);
6644 8 : if (overflow)
6645 : return false;
6646 :
6647 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6648 16 : TYPE_SIGN (lhs_type), &overflow);
6649 8 : if (overflow)
6650 : return false;
6651 :
6652 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6653 8 : <= TYPE_PRECISION (lhs_type));
6654 408 : }
6655 :
6656 : /* Check if masking can be supported by inserting a conditional expression.
6657 : CODE is the code for the operation. COND_FN is the conditional internal
6658 : function, if it exists. VECTYPE_IN is the type of the vector input. */
6659 : static bool
6660 5102 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6661 : tree vectype_in)
6662 : {
6663 5102 : if (cond_fn != IFN_LAST
6664 5102 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
6665 : OPTIMIZE_FOR_SPEED))
6666 : return false;
6667 :
6668 3596 : if (code.is_tree_code ())
6669 3590 : switch (tree_code (code))
6670 : {
6671 : case DOT_PROD_EXPR:
6672 : case SAD_EXPR:
6673 : return true;
6674 :
6675 : default:
6676 : break;
6677 : }
6678 : return false;
6679 : }
6680 :
6681 : /* Insert a conditional expression to enable masked vectorization. CODE is the
6682 : code for the operation. VOP is the array of operands. MASK is the loop
6683 : mask. GSI is a statement iterator used to place the new conditional
6684 : expression. */
6685 : static void
6686 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6687 : gimple_stmt_iterator *gsi)
6688 : {
6689 4 : switch (tree_code (code))
6690 : {
6691 4 : case DOT_PROD_EXPR:
6692 4 : {
6693 4 : tree vectype = TREE_TYPE (vop[1]);
6694 4 : tree zero = build_zero_cst (vectype);
6695 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6696 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6697 : mask, vop[1], zero);
6698 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6699 4 : vop[1] = masked_op1;
6700 4 : break;
6701 : }
6702 :
6703 0 : case SAD_EXPR:
6704 0 : {
6705 0 : tree vectype = TREE_TYPE (vop[1]);
6706 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6707 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6708 : mask, vop[1], vop[0]);
6709 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6710 0 : vop[1] = masked_op1;
6711 0 : break;
6712 : }
6713 :
6714 0 : default:
6715 0 : gcc_unreachable ();
6716 : }
6717 4 : }
6718 :
6719 : /* Given an operation with CODE in loop reduction path whose reduction PHI is
6720 : specified by REDUC_INFO, the operation has TYPE of scalar result, and its
6721 : input vectype is represented by VECTYPE_IN. The vectype of vectorized result
6722 : may be different from VECTYPE_IN, either in base type or vectype lanes,
6723 : lane-reducing operation is the case. This function check if it is possible,
6724 : and how to perform partial vectorization on the operation in the context
6725 : of LOOP_VINFO. */
6726 :
6727 : static void
6728 3209 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
6729 : vect_reduc_info reduc_info,
6730 : slp_tree slp_node,
6731 : code_helper code, tree type,
6732 : tree vectype_in)
6733 : {
6734 3209 : enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
6735 3209 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
6736 3209 : internal_fn cond_fn
6737 924 : = ((code.is_internal_fn ()
6738 924 : && internal_fn_mask_index ((internal_fn)code) != -1)
6739 3209 : ? (internal_fn)code : get_conditional_internal_fn (code, type));
6740 :
6741 3209 : if (reduc_type != FOLD_LEFT_REDUCTION
6742 2529 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6743 5695 : && (cond_fn == IFN_LAST
6744 2486 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6745 : OPTIMIZE_FOR_SPEED)))
6746 : {
6747 1514 : if (dump_enabled_p ())
6748 98 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6749 : "can't operate on partial vectors because"
6750 : " no conditional operation is available.\n");
6751 1514 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6752 : }
6753 1695 : else if (reduc_type == FOLD_LEFT_REDUCTION
6754 1695 : && reduc_fn == IFN_LAST
6755 1695 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
6756 : {
6757 0 : if (dump_enabled_p ())
6758 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6759 : "can't operate on partial vectors because"
6760 : " no conditional operation is available.\n");
6761 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6762 : }
6763 1695 : else if (reduc_type == FOLD_LEFT_REDUCTION
6764 680 : && internal_fn_mask_index (reduc_fn) == -1
6765 680 : && FLOAT_TYPE_P (vectype_in)
6766 2370 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
6767 : {
6768 0 : if (dump_enabled_p ())
6769 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6770 : "can't operate on partial vectors because"
6771 : " signed zeros cannot be preserved.\n");
6772 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6773 : }
6774 : else
6775 : {
6776 1695 : internal_fn mask_reduc_fn
6777 1695 : = get_masked_reduction_fn (reduc_fn, vectype_in);
6778 1695 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6779 1695 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
6780 1695 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);
6781 :
6782 1695 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6783 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
6784 : else
6785 1695 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
6786 : }
6787 3209 : }
6788 :
6789 : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
6790 : the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
6791 : and the analysis is for slp if SLP_NODE is not NULL.
6792 :
6793 : For a lane-reducing operation, the loop reduction path that it lies in,
6794 : may contain normal operation, or other lane-reducing operation of different
6795 : input type size, an example as:
6796 :
6797 : int sum = 0;
6798 : for (i)
6799 : {
6800 : ...
6801 : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
6802 : sum += w[i]; // widen-sum <vector(16) char>
6803 : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
6804 : sum += n[i]; // normal <vector(4) int>
6805 : ...
6806 : }
6807 :
6808 : Vectorization factor is essentially determined by operation whose input
6809 : vectype has the most lanes ("vector(16) char" in the example), while we
6810 : need to choose input vectype with the least lanes ("vector(4) int" in the
6811 : example) to determine effective number of vector reduction PHIs. */
6812 :
6813 : bool
6814 330379 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
6815 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
6816 : {
6817 330379 : gimple *stmt = stmt_info->stmt;
6818 :
6819 330379 : if (!lane_reducing_stmt_p (stmt))
6820 : return false;
6821 :
6822 454 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
6823 :
6824 454 : if (!INTEGRAL_TYPE_P (type))
6825 : return false;
6826 :
6827 : /* Do not try to vectorize bit-precision reductions. */
6828 454 : if (!type_has_mode_precision_p (type))
6829 : return false;
6830 :
6831 454 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6832 :
6833 : /* TODO: Support lane-reducing operation that does not directly participate
6834 : in loop reduction. */
6835 454 : if (!reduc_info)
6836 : return false;
6837 :
6838 : /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
6839 : recoginized. */
6840 454 : gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
6841 454 : gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);
6842 :
6843 1816 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
6844 : {
6845 1362 : slp_tree slp_op;
6846 1362 : tree op;
6847 1362 : tree vectype;
6848 1362 : enum vect_def_type dt;
6849 :
6850 1362 : if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
6851 : &slp_op, &dt, &vectype))
6852 : {
6853 0 : if (dump_enabled_p ())
6854 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6855 : "use not simple.\n");
6856 0 : return false;
6857 : }
6858 :
6859 1362 : if (!vectype)
6860 : {
6861 6 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
6862 : slp_op);
6863 6 : if (!vectype)
6864 : return false;
6865 : }
6866 :
6867 1362 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
6868 : {
6869 0 : if (dump_enabled_p ())
6870 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6871 : "incompatible vector types for invariants\n");
6872 0 : return false;
6873 : }
6874 :
6875 1362 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6876 454 : continue;
6877 :
6878 : /* There should be at most one cycle def in the stmt. */
6879 908 : if (VECTORIZABLE_CYCLE_DEF (dt))
6880 : return false;
6881 : }
6882 :
6883 454 : slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
6884 454 : tree vectype_in = SLP_TREE_VECTYPE (node_in);
6885 454 : gcc_assert (vectype_in);
6886 :
6887 : /* Compute number of effective vector statements for costing. */
6888 454 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
6889 454 : gcc_assert (ncopies_for_cost >= 1);
6890 :
6891 454 : if (vect_is_emulated_mixed_dot_prod (slp_node))
6892 : {
6893 : /* We need extra two invariants: one that contains the minimum signed
6894 : value and one that contains half of its negative. */
6895 9 : int prologue_stmts = 2;
6896 9 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
6897 : scalar_to_vec, slp_node, 0,
6898 : vect_prologue);
6899 9 : if (dump_enabled_p ())
6900 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
6901 : "extra prologue_cost = %d .\n", cost);
6902 :
6903 : /* Three dot-products and a subtraction. */
6904 9 : ncopies_for_cost *= 4;
6905 : }
6906 :
6907 454 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
6908 : 0, vect_body);
6909 :
6910 454 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
6911 : {
6912 43 : enum tree_code code = gimple_assign_rhs_code (stmt);
6913 43 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
6914 43 : node_in, code, type,
6915 : vectype_in);
6916 : }
6917 :
6918 : /* Transform via vect_transform_reduction. */
6919 454 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6920 454 : return true;
6921 : }
6922 :
6923 : /* Function vectorizable_reduction.
6924 :
6925 : Check if STMT_INFO performs a reduction operation that can be vectorized.
6926 : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6927 : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6928 : Return true if STMT_INFO is vectorizable in this way.
6929 :
6930 : This function also handles reduction idioms (patterns) that have been
6931 : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6932 : may be of this form:
6933 : X = pattern_expr (arg0, arg1, ..., X)
6934 : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6935 : sequence that had been detected and replaced by the pattern-stmt
6936 : (STMT_INFO).
6937 :
6938 : This function also handles reduction of condition expressions, for example:
6939 : for (int i = 0; i < N; i++)
6940 : if (a[i] < value)
6941 : last = a[i];
6942 : This is handled by vectorising the loop and creating an additional vector
6943 : containing the loop indexes for which "a[i] < value" was true. In the
6944 : function epilogue this is reduced to a single max value and then used to
6945 : index into the vector of results.
6946 :
6947 : In some cases of reduction patterns, the type of the reduction variable X is
6948 : different than the type of the other arguments of STMT_INFO.
6949 : In such cases, the vectype that is used when transforming STMT_INFO into
6950 : a vector stmt is different than the vectype that is used to determine the
6951 : vectorization factor, because it consists of a different number of elements
6952 : than the actual number of elements that are being operated upon in parallel.
6953 :
6954 : For example, consider an accumulation of shorts into an int accumulator.
6955 : On some targets it's possible to vectorize this pattern operating on 8
6956 : shorts at a time (hence, the vectype for purposes of determining the
6957 : vectorization factor should be V8HI); on the other hand, the vectype that
6958 : is used to create the vector form is actually V4SI (the type of the result).
6959 :
6960 : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6961 : indicates what is the actual level of parallelism (V8HI in the example), so
6962 : that the right vectorization factor would be derived. This vectype
6963 : corresponds to the type of arguments to the reduction stmt, and should *NOT*
6964 : be used to create the vectorized stmt. The right vectype for the vectorized
6965 : stmt is obtained from the type of the result X:
6966 : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6967 :
6968 : This means that, contrary to "regular" reductions (or "regular" stmts in
6969 : general), the following equation:
6970 : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6971 : does *NOT* necessarily hold for reduction patterns. */
6972 :
6973 : bool
6974 329925 : vectorizable_reduction (loop_vec_info loop_vinfo,
6975 : stmt_vec_info stmt_info, slp_tree slp_node,
6976 : slp_instance slp_node_instance,
6977 : stmt_vector_for_cost *cost_vec)
6978 : {
6979 329925 : tree vectype_in = NULL_TREE;
6980 329925 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6981 329925 : stmt_vec_info cond_stmt_vinfo = NULL;
6982 329925 : int i;
6983 329925 : int ncopies;
6984 329925 : bool single_defuse_cycle = false;
6985 329925 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6986 329925 : tree cond_reduc_val = NULL_TREE;
6987 :
6988 : /* Make sure it was already recognized as a reduction computation. */
6989 329925 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6990 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6991 329925 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6992 : return false;
6993 :
6994 : /* The reduction meta. */
6995 57752 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6996 :
6997 57752 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6998 : {
6999 1427 : gcc_assert (is_a <gphi *> (stmt_info->stmt));
7000 : /* We eventually need to set a vector type on invariant arguments. */
7001 : unsigned j;
7002 : slp_tree child;
7003 4273 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7004 2854 : if (!vect_maybe_update_slp_op_vectype (child,
7005 : SLP_TREE_VECTYPE (slp_node)))
7006 : {
7007 0 : if (dump_enabled_p ())
7008 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7009 : "incompatible vector types for "
7010 : "invariants\n");
7011 0 : return false;
7012 : }
7013 2854 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7014 2854 : && !useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
7015 : SLP_TREE_VECTYPE (child)))
7016 : {
7017 : /* With bools we can have mask and non-mask precision vectors
7018 : or different non-mask precisions. while pattern recog is
7019 : supposed to guarantee consistency here, we do not have
7020 : pattern stmts for PHIs (PR123316).
7021 : Deal with that here instead of ICEing later. */
7022 8 : if (dump_enabled_p ())
7023 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7024 : "incompatible vector type setup from "
7025 : "bool pattern detection\n");
7026 8 : return false;
7027 : }
7028 : /* Analysis for double-reduction is done on the outer
7029 : loop PHI, nested cycles have no further restrictions. */
7030 1419 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7031 1419 : return true;
7032 : }
7033 :
7034 56325 : if (!is_a <gphi *> (stmt_info->stmt))
7035 : {
7036 6876 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
7037 6876 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
7038 6876 : return true;
7039 : }
7040 :
7041 49449 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7042 49449 : stmt_vec_info phi_info = stmt_info;
7043 49449 : bool double_reduc = false;
7044 49449 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7045 : {
7046 : /* We arrive here for both the inner loop LC PHI and the
7047 : outer loop PHI. The latter is what we want to analyze the
7048 : reduction with. The LC PHI is handled by vectorizable_lc_phi. */
7049 287 : if (gimple_bb (stmt_info->stmt) != loop->header)
7050 0 : return false;
7051 :
7052 : /* Set loop and phi_info to the inner loop. */
7053 287 : use_operand_p use_p;
7054 287 : gimple *use_stmt;
7055 287 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7056 : &use_p, &use_stmt);
7057 287 : gcc_assert (res);
7058 287 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
7059 287 : loop = loop->inner;
7060 287 : double_reduc = true;
7061 : }
7062 :
7063 49449 : const bool reduc_chain = reduc_info->is_reduc_chain;
7064 49449 : slp_node_instance->reduc_phis = slp_node;
7065 : /* ??? We're leaving slp_node to point to the PHIs, we only
7066 : need it to get at the number of vector stmts which wasn't
7067 : yet initialized for the instance root. */
7068 :
7069 : /* PHIs should not participate in patterns. */
7070 49449 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7071 49449 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7072 :
7073 : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7074 : and compute the reduction chain length. Discover the real
7075 : reduction operation stmt on the way (slp_for_stmt_info). */
7076 49449 : unsigned reduc_chain_length = 0;
7077 49449 : stmt_info = NULL;
7078 49449 : slp_tree slp_for_stmt_info = NULL;
7079 49449 : slp_tree vdef_slp = slp_node_instance->root;
7080 108742 : while (vdef_slp != slp_node)
7081 : {
7082 60045 : int reduc_idx = SLP_TREE_REDUC_IDX (vdef_slp);
7083 60045 : if (reduc_idx == -1)
7084 : {
7085 744 : if (dump_enabled_p ())
7086 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7087 : "reduction chain broken by patterns.\n");
7088 752 : return false;
7089 : }
7090 59301 : stmt_vec_info vdef = SLP_TREE_REPRESENTATIVE (vdef_slp);
7091 59301 : if (is_a <gphi *> (vdef->stmt))
7092 : {
7093 574 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7094 : /* Do not count PHIs towards the chain length. */
7095 574 : continue;
7096 : }
7097 58727 : gimple_match_op op;
7098 58727 : if (!gimple_extract_op (vdef->stmt, &op))
7099 : {
7100 0 : if (dump_enabled_p ())
7101 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7102 : "reduction chain includes unsupported"
7103 : " statement type.\n");
7104 0 : return false;
7105 : }
7106 58727 : if (CONVERT_EXPR_CODE_P (op.code))
7107 : {
7108 3312 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7109 : {
7110 8 : if (dump_enabled_p ())
7111 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7112 : "conversion in the reduction chain.\n");
7113 8 : return false;
7114 : }
7115 3304 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
7116 : }
7117 : else
7118 : {
7119 : /* First non-conversion stmt. */
7120 55415 : if (!slp_for_stmt_info)
7121 48697 : slp_for_stmt_info = vdef_slp;
7122 :
7123 55415 : if (lane_reducing_op_p (op.code))
7124 : {
7125 : /* The last operand of lane-reducing operation is for
7126 : reduction. */
7127 454 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7128 :
7129 454 : slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
7130 454 : tree vectype_op = SLP_TREE_VECTYPE (op_node);
7131 454 : tree type_op = TREE_TYPE (op.ops[0]);
7132 454 : if (!vectype_op)
7133 : {
7134 9 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7135 : type_op);
7136 9 : if (!vectype_op
7137 9 : || !vect_maybe_update_slp_op_vectype (op_node,
7138 : vectype_op))
7139 0 : return false;
7140 : }
7141 :
7142 : /* To accommodate lane-reducing operations of mixed input
7143 : vectypes, choose input vectype with the least lanes for the
7144 : reduction PHI statement, which would result in the most
7145 : ncopies for vectorized reduction results. */
7146 454 : if (!vectype_in
7147 454 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7148 46 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7149 431 : vectype_in = vectype_op;
7150 : }
7151 54961 : else if (!vectype_in)
7152 48266 : vectype_in = SLP_TREE_VECTYPE (slp_node);
7153 55415 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7154 : }
7155 58719 : reduc_chain_length++;
7156 : }
7157 48697 : if (!slp_for_stmt_info)
7158 : {
7159 0 : if (dump_enabled_p ())
7160 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7161 : "only noop-conversions in the reduction chain.\n");
7162 0 : return false;
7163 : }
7164 48697 : stmt_info = SLP_TREE_REPRESENTATIVE (slp_for_stmt_info);
7165 :
7166 : /* PHIs should not participate in patterns. */
7167 48697 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7168 :
7169 : /* 1. Is vectorizable reduction? */
7170 : /* Not supportable if the reduction variable is used in the loop, unless
7171 : it's a reduction chain. */
7172 48697 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7173 0 : && !reduc_chain)
7174 : return false;
7175 :
7176 : /* Reductions that are not used even in an enclosing outer-loop,
7177 : are expected to be "live" (used out of the loop). */
7178 48697 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7179 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7180 : return false;
7181 :
7182 : /* 2. Has this been recognized as a reduction pattern?
7183 :
7184 : Check if STMT represents a pattern that has been recognized
7185 : in earlier analysis stages. For stmts that represent a pattern,
7186 : the STMT_VINFO_RELATED_STMT field records the last stmt in
7187 : the original sequence that constitutes the pattern. */
7188 :
7189 48697 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7190 48697 : if (orig_stmt_info)
7191 : {
7192 3271 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7193 3271 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7194 : }
7195 :
7196 : /* 3. Check the operands of the operation. The first operands are defined
7197 : inside the loop body. The last operand is the reduction variable,
7198 : which is defined by the loop-header-phi. */
7199 :
7200 48697 : tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
7201 48697 : VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
7202 :
7203 48697 : gimple_match_op op;
7204 48697 : if (!gimple_extract_op (stmt_info->stmt, &op))
7205 0 : gcc_unreachable ();
7206 48697 : bool lane_reducing = lane_reducing_op_p (op.code);
7207 :
7208 48697 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7209 15140 : && !SCALAR_FLOAT_TYPE_P (op.type))
7210 : return false;
7211 :
7212 : /* Do not try to vectorize bit-precision reductions. */
7213 48697 : if (!type_has_mode_precision_p (op.type)
7214 1552 : && op.code != BIT_AND_EXPR
7215 1472 : && op.code != BIT_IOR_EXPR
7216 49132 : && op.code != BIT_XOR_EXPR)
7217 : return false;
7218 :
7219 : /* Lane-reducing ops also never can be used in a SLP reduction group
7220 : since we'll mix lanes belonging to different reductions. But it's
7221 : OK to use them in a reduction chain or when the reduction group
7222 : has just one element. */
7223 48387 : if (lane_reducing
7224 48387 : && !reduc_chain
7225 404 : && SLP_TREE_LANES (slp_node) > 1)
7226 : {
7227 0 : if (dump_enabled_p ())
7228 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7229 : "lane-reducing reduction in reduction group.\n");
7230 0 : return false;
7231 : }
7232 :
7233 : /* All uses but the last are expected to be defined in the loop.
7234 : The last use is the reduction variable. In case of nested cycle this
7235 : assumption is not true: we use reduc_index to record the index of the
7236 : reduction variable. */
7237 48387 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7238 48387 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7239 48387 : gcc_assert (op.code != COND_EXPR || !COMPARISON_CLASS_P (op.ops[0]));
7240 153627 : for (i = 0; i < (int) op.num_ops; i++)
7241 : {
7242 : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7243 105240 : if (i == 0 && op.code == COND_EXPR)
7244 52808 : continue;
7245 :
7246 104433 : stmt_vec_info def_stmt_info;
7247 104433 : enum vect_def_type dt;
7248 104433 : if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
7249 : i, &op.ops[i], &slp_op[i], &dt,
7250 104433 : &vectype_op[i], &def_stmt_info))
7251 : {
7252 0 : if (dump_enabled_p ())
7253 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7254 : "use not simple.\n");
7255 0 : return false;
7256 : }
7257 :
7258 : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7259 : reduction operand twice (once as definition, once as else). */
7260 104433 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]
7261 208866 : == SLP_TREE_CHILDREN
7262 104433 : (slp_for_stmt_info)[SLP_TREE_REDUC_IDX (slp_for_stmt_info)])
7263 52001 : continue;
7264 :
7265 : /* There should be only one cycle def in the stmt, the one
7266 : leading to reduc_def. */
7267 52432 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]->cycle_info.id != -1)
7268 : return false;
7269 :
7270 52432 : if (!vectype_op[i])
7271 4506 : vectype_op[i]
7272 4506 : = get_vectype_for_scalar_type (loop_vinfo,
7273 4506 : TREE_TYPE (op.ops[i]), slp_op[i]);
7274 :
7275 : /* Record how the non-reduction-def value of COND_EXPR is defined.
7276 : ??? For a chain of multiple CONDs we'd have to match them up all. */
7277 52432 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7278 : {
7279 784 : if (dt == vect_constant_def)
7280 : {
7281 95 : cond_reduc_dt = dt;
7282 95 : cond_reduc_val = op.ops[i];
7283 : }
7284 689 : else if (dt == vect_induction_def
7285 408 : && def_stmt_info
7286 1097 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7287 : {
7288 109 : cond_reduc_dt = dt;
7289 109 : cond_stmt_vinfo = def_stmt_info;
7290 : }
7291 : }
7292 : }
7293 :
7294 48387 : enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7295 : /* If we have a condition reduction, see if we can simplify it further. */
7296 48387 : if (reduction_type == COND_REDUCTION)
7297 : {
7298 795 : if (SLP_TREE_LANES (slp_node) != 1)
7299 : return false;
7300 :
7301 : /* When the condition uses the reduction value in the condition, fail. */
7302 771 : if (SLP_TREE_REDUC_IDX (slp_node) == 0)
7303 : {
7304 0 : if (dump_enabled_p ())
7305 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7306 : "condition depends on previous iteration\n");
7307 0 : return false;
7308 : }
7309 :
7310 771 : if (reduc_chain_length == 1
7311 771 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7312 : OPTIMIZE_FOR_SPEED)
7313 748 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7314 : vectype_in,
7315 : OPTIMIZE_FOR_SPEED)))
7316 : {
7317 0 : if (dump_enabled_p ())
7318 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7319 : "optimizing condition reduction with"
7320 : " FOLD_EXTRACT_LAST.\n");
7321 0 : VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7322 : }
7323 771 : else if (cond_reduc_dt == vect_induction_def)
7324 : {
7325 109 : tree base
7326 : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7327 109 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7328 :
7329 109 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7330 : && TREE_CODE (step) == INTEGER_CST);
7331 109 : cond_reduc_val = NULL_TREE;
7332 109 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7333 109 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7334 109 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7335 : ;
7336 : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7337 : above base; punt if base is the minimum value of the type for
7338 : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7339 97 : else if (tree_int_cst_sgn (step) == -1)
7340 : {
7341 18 : cond_reduc_op_code = MIN_EXPR;
7342 18 : if (tree_int_cst_sgn (base) == -1)
7343 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7344 18 : else if (tree_int_cst_lt (base,
7345 18 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7346 18 : cond_reduc_val
7347 18 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7348 : }
7349 : else
7350 : {
7351 79 : cond_reduc_op_code = MAX_EXPR;
7352 79 : if (tree_int_cst_sgn (base) == 1)
7353 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7354 79 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7355 : base))
7356 79 : cond_reduc_val
7357 79 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7358 : }
7359 97 : if (cond_reduc_val)
7360 : {
7361 97 : if (dump_enabled_p ())
7362 61 : dump_printf_loc (MSG_NOTE, vect_location,
7363 : "condition expression based on "
7364 : "integer induction.\n");
7365 97 : VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
7366 97 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
7367 97 : = cond_reduc_val;
7368 97 : VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7369 : }
7370 : }
7371 662 : else if (cond_reduc_dt == vect_constant_def)
7372 : {
7373 85 : enum vect_def_type cond_initial_dt;
7374 85 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7375 85 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7376 85 : if (cond_initial_dt == vect_constant_def
7377 107 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7378 22 : TREE_TYPE (cond_reduc_val)))
7379 : {
7380 22 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7381 : cond_initial_val, cond_reduc_val);
7382 22 : if (e && (integer_onep (e) || integer_zerop (e)))
7383 : {
7384 22 : if (dump_enabled_p ())
7385 16 : dump_printf_loc (MSG_NOTE, vect_location,
7386 : "condition expression based on "
7387 : "compile time constant.\n");
7388 : /* Record reduction code at analysis stage. */
7389 22 : VECT_REDUC_INFO_CODE (reduc_info)
7390 22 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7391 22 : VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
7392 : }
7393 : }
7394 : }
7395 : }
7396 :
7397 48363 : if (STMT_VINFO_LIVE_P (phi_info))
7398 : return false;
7399 :
7400 48363 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
7401 :
7402 48363 : gcc_assert (ncopies >= 1);
7403 :
7404 48363 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7405 :
7406 : /* 4.2. Check support for the epilog operation.
7407 :
7408 : If STMT represents a reduction pattern, then the type of the
7409 : reduction variable may be different than the type of the rest
7410 : of the arguments. For example, consider the case of accumulation
7411 : of shorts into an int accumulator; The original code:
7412 : S1: int_a = (int) short_a;
7413 : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7414 :
7415 : was replaced with:
7416 : STMT: int_acc = widen_sum <short_a, int_acc>
7417 :
7418 : This means that:
7419 : 1. The tree-code that is used to create the vector operation in the
7420 : epilog code (that reduces the partial results) is not the
7421 : tree-code of STMT, but is rather the tree-code of the original
7422 : stmt from the pattern that STMT is replacing. I.e, in the example
7423 : above we want to use 'widen_sum' in the loop, but 'plus' in the
7424 : epilog.
7425 : 2. The type (mode) we use to check available target support
7426 : for the vector operation to be created in the *epilog*, is
7427 : determined by the type of the reduction variable (in the example
7428 : above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7429 : However the type (mode) we use to check available target support
7430 : for the vector operation to be created *inside the loop*, is
7431 : determined by the type of the other arguments to STMT (in the
7432 : example we'd check this: optab_handler (widen_sum_optab,
7433 : vect_short_mode)).
7434 :
7435 : This is contrary to "regular" reductions, in which the types of all
7436 : the arguments are the same as the type of the reduction variable.
7437 : For "regular" reductions we can therefore use the same vector type
7438 : (and also the same tree-code) when generating the epilog code and
7439 : when generating the code inside the loop. */
7440 :
7441 48363 : code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
7442 :
7443 : /* If conversion might have created a conditional operation like
7444 : IFN_COND_ADD already. Use the internal code for the following checks. */
7445 48363 : if (orig_code.is_internal_fn ())
7446 : {
7447 3682 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7448 3682 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7449 : }
7450 :
7451 48363 : VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
7452 :
7453 48363 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7454 48363 : if (reduction_type == TREE_CODE_REDUCTION)
7455 : {
7456 : /* Check whether it's ok to change the order of the computation.
7457 : Generally, when vectorizing a reduction we change the order of the
7458 : computation. This may change the behavior of the program in some
7459 : cases, so we need to check that this is ok. One exception is when
7460 : vectorizing an outer-loop: the inner-loop is executed sequentially,
7461 : and therefore vectorizing reductions in the inner-loop during
7462 : outer-loop vectorization is safe. Likewise when we are vectorizing
7463 : a series of reductions using SLP and the VF is one the reductions
7464 : are performed in scalar order. */
7465 47592 : if (!reduc_chain
7466 47592 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7467 : ;
7468 47451 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7469 : {
7470 : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7471 : is not directy used in stmt. */
7472 4799 : if (reduc_chain_length != 1)
7473 : {
7474 67 : if (dump_enabled_p ())
7475 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7476 : "in-order reduction chain without SLP.\n");
7477 67 : return false;
7478 : }
7479 : /* Code generation doesn't support function calls other
7480 : than .COND_*. */
7481 4732 : if (!op.code.is_tree_code ()
7482 4866 : && !(op.code.is_internal_fn ()
7483 67 : && conditional_internal_fn_code (internal_fn (op.code))
7484 : != ERROR_MARK))
7485 : {
7486 18 : if (dump_enabled_p ())
7487 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7488 : "in-order reduction chain operation not "
7489 : "supported.\n");
7490 18 : return false;
7491 : }
7492 4714 : VECT_REDUC_INFO_TYPE (reduc_info)
7493 4714 : = reduction_type = FOLD_LEFT_REDUCTION;
7494 : }
7495 42652 : else if (!commutative_binary_op_p (orig_code, op.type)
7496 42652 : || !associative_binary_op_p (orig_code, op.type))
7497 : {
7498 152 : if (dump_enabled_p ())
7499 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7500 : "reduction: not commutative/associative\n");
7501 152 : return false;
7502 : }
7503 : }
7504 :
7505 4714 : if ((reduction_type == COND_REDUCTION
7506 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7507 : || reduction_type == CONST_COND_REDUCTION
7508 43412 : || reduction_type == EXTRACT_LAST_REDUCTION)
7509 : && 1
7510 771 : && ncopies > 1)
7511 : {
7512 276 : if (dump_enabled_p ())
7513 60 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7514 : "multiple types in condition reduction.\n");
7515 276 : return false;
7516 : }
7517 :
7518 : /* See if we can convert a mask vector to a corresponding bool data vector
7519 : to perform the epilogue reduction. */
7520 47850 : tree alt_vectype_out = NULL_TREE;
7521 47850 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
7522 : {
7523 968 : alt_vectype_out
7524 1936 : = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
7525 968 : TREE_TYPE (vectype_out),
7526 : TYPE_VECTOR_SUBPARTS
7527 : (vectype_out));
7528 968 : if (!alt_vectype_out
7529 968 : || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
7530 1917 : TYPE_VECTOR_SUBPARTS (vectype_out))
7531 1936 : || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
7532 19 : alt_vectype_out = NULL_TREE;
7533 : }
7534 :
7535 47850 : internal_fn reduc_fn = IFN_LAST;
7536 47850 : if (reduction_type == TREE_CODE_REDUCTION
7537 47850 : || reduction_type == FOLD_LEFT_REDUCTION
7538 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7539 495 : || reduction_type == CONST_COND_REDUCTION)
7540 : {
7541 42752 : if (reduction_type == FOLD_LEFT_REDUCTION
7542 51393 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7543 42752 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7544 : {
7545 46790 : internal_fn sbool_fn = IFN_LAST;
7546 46790 : if (reduc_fn == IFN_LAST)
7547 : ;
7548 44872 : else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
7549 968 : || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
7550 : == MODE_VECTOR_BOOL))
7551 88776 : && direct_internal_fn_supported_p (reduc_fn, vectype_out,
7552 : OPTIMIZE_FOR_SPEED))
7553 : ;
7554 10203 : else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
7555 968 : && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
7556 11171 : && direct_internal_fn_supported_p (sbool_fn, vectype_out,
7557 : OPTIMIZE_FOR_SPEED))
7558 73 : reduc_fn = sbool_fn;
7559 10130 : else if (reduction_type != FOLD_LEFT_REDUCTION
7560 10130 : && alt_vectype_out
7561 10130 : && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
7562 : OPTIMIZE_FOR_SPEED))
7563 724 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7564 : else
7565 : {
7566 9406 : if (dump_enabled_p ())
7567 846 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7568 : "reduc op not supported by target.\n");
7569 :
7570 9406 : reduc_fn = IFN_LAST;
7571 : }
7572 : }
7573 : else
7574 : {
7575 676 : if (dump_enabled_p ())
7576 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7577 : "no reduc code for scalar code.\n");
7578 :
7579 676 : return false;
7580 : }
7581 46790 : if (reduc_fn == IFN_LAST
7582 46790 : && VECTOR_BOOLEAN_TYPE_P (vectype_out))
7583 : {
7584 171 : if (!alt_vectype_out)
7585 : {
7586 12 : if (dump_enabled_p ())
7587 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7588 : "cannot turn mask into bool data vector for "
7589 : "reduction epilogue.\n");
7590 12 : return false;
7591 : }
7592 159 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7593 : }
7594 : }
7595 384 : else if (reduction_type == COND_REDUCTION)
7596 : {
7597 384 : int scalar_precision
7598 384 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7599 384 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
7600 384 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7601 : vectype_out);
7602 :
7603 384 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7604 : OPTIMIZE_FOR_SPEED))
7605 12 : reduc_fn = IFN_REDUC_MAX;
7606 : }
7607 47162 : VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
7608 :
7609 47162 : if (reduction_type != EXTRACT_LAST_REDUCTION
7610 : && reduc_fn == IFN_LAST
7611 : && !nunits_out.is_constant ())
7612 : {
7613 : if (dump_enabled_p ())
7614 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7615 : "missing target support for reduction on"
7616 : " variable-length vectors.\n");
7617 : return false;
7618 : }
7619 :
7620 : /* For SLP reductions, see if there is a neutral value we can use. */
7621 47162 : tree neutral_op = NULL_TREE;
7622 47162 : tree initial_value = NULL_TREE;
7623 47162 : if (reduc_chain)
7624 1374 : initial_value = vect_phi_initial_value (reduc_def_phi);
7625 47162 : neutral_op = neutral_op_for_reduction (TREE_TYPE
7626 : (gimple_phi_result (reduc_def_phi)),
7627 : orig_code, initial_value);
7628 47162 : VECT_REDUC_INFO_NEUTRAL_OP (reduc_info) = neutral_op;
7629 :
7630 47162 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7631 : {
7632 : /* We can't support in-order reductions of code such as this:
7633 :
7634 : for (int i = 0; i < n1; ++i)
7635 : for (int j = 0; j < n2; ++j)
7636 : l += a[j];
7637 :
7638 : since GCC effectively transforms the loop when vectorizing:
7639 :
7640 : for (int i = 0; i < n1 / VF; ++i)
7641 : for (int j = 0; j < n2; ++j)
7642 : for (int k = 0; k < VF; ++k)
7643 : l += a[j];
7644 :
7645 : which is a reassociation of the original operation. */
7646 56 : if (dump_enabled_p ())
7647 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7648 : "in-order double reduction not supported.\n");
7649 :
7650 56 : return false;
7651 : }
7652 :
7653 47106 : if (reduction_type == FOLD_LEFT_REDUCTION
7654 3982 : && SLP_TREE_LANES (slp_node) > 1
7655 117 : && !reduc_chain)
7656 : {
7657 : /* We cannot use in-order reductions in this case because there is
7658 : an implicit reassociation of the operations involved. */
7659 55 : if (dump_enabled_p ())
7660 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7661 : "in-order unchained SLP reductions not supported.\n");
7662 55 : return false;
7663 : }
7664 :
7665 : /* For double reductions, and for SLP reductions with a neutral value,
7666 : we construct a variable-length initial vector by loading a vector
7667 : full of the neutral value and then shift-and-inserting the start
7668 : values into the low-numbered elements. This is however not needed
7669 : when neutral and initial value are equal or we can handle the
7670 : initial value via adjustment in the epilogue. */
7671 47051 : if ((double_reduc || neutral_op)
7672 : && !nunits_out.is_constant ()
7673 : && reduction_type != INTEGER_INDUC_COND_REDUCTION
7674 : && !((SLP_TREE_LANES (slp_node) == 1 || reduc_chain)
7675 : && neutral_op
7676 : && (!double_reduc
7677 : || operand_equal_p (neutral_op,
7678 : vect_phi_initial_value (reduc_def_phi))))
7679 : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7680 : vectype_out, OPTIMIZE_FOR_BOTH))
7681 : {
7682 : if (dump_enabled_p ())
7683 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7684 : "reduction on variable-length vectors requires"
7685 : " target support for a vector-shift-and-insert"
7686 : " operation.\n");
7687 : return false;
7688 : }
7689 :
7690 : /* Check extra constraints for variable-length unchained SLP reductions. */
7691 47051 : if (!reduc_chain
7692 : && !nunits_out.is_constant ())
7693 : {
7694 : /* We checked above that we could build the initial vector when
7695 : there's a neutral element value. Check here for the case in
7696 : which each SLP statement has its own initial value and in which
7697 : that value needs to be repeated for every instance of the
7698 : statement within the initial vector. */
7699 : unsigned int group_size = SLP_TREE_LANES (slp_node);
7700 : if (!neutral_op
7701 : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7702 : TREE_TYPE (vectype_out)))
7703 : {
7704 : if (dump_enabled_p ())
7705 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7706 : "unsupported form of SLP reduction for"
7707 : " variable-length vectors: cannot build"
7708 : " initial vector.\n");
7709 : return false;
7710 : }
7711 : /* The epilogue code relies on the number of elements being a multiple
7712 : of the group size. The duplicate-and-interleave approach to setting
7713 : up the initial vector does too. */
7714 : if (!multiple_p (nunits_out, group_size))
7715 : {
7716 : if (dump_enabled_p ())
7717 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7718 : "unsupported form of SLP reduction for"
7719 : " variable-length vectors: the vector size"
7720 : " is not a multiple of the number of results.\n");
7721 : return false;
7722 : }
7723 : }
7724 :
7725 47051 : if (reduction_type == COND_REDUCTION)
7726 : {
7727 384 : widest_int ni;
7728 :
7729 384 : if (! max_loop_iterations (loop, &ni))
7730 : {
7731 14 : if (dump_enabled_p ())
7732 0 : dump_printf_loc (MSG_NOTE, vect_location,
7733 : "loop count not known, cannot create cond "
7734 : "reduction.\n");
7735 14 : return false;
7736 : }
7737 : /* Convert backedges to iterations. */
7738 370 : ni += 1;
7739 :
7740 : /* The additional index will be the same type as the condition. Check
7741 : that the loop can fit into this less one (because we'll use up the
7742 : zero slot for when there are no matches). */
7743 370 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7744 370 : if (wi::geu_p (ni, wi::to_widest (max_index)))
7745 : {
7746 90 : if (dump_enabled_p ())
7747 54 : dump_printf_loc (MSG_NOTE, vect_location,
7748 : "loop size is greater than data size.\n");
7749 90 : return false;
7750 : }
7751 384 : }
7752 :
7753 : /* In case the vectorization factor (VF) is bigger than the number
7754 : of elements that we can fit in a vectype (nunits), we have to generate
7755 : more than one vector stmt - i.e - we need to "unroll" the
7756 : vector stmt by a factor VF/nunits. For more details see documentation
7757 : in vectorizable_operation. */
7758 :
7759 : /* If the reduction is used in an outer loop we need to generate
7760 : VF intermediate results, like so (e.g. for ncopies=2):
7761 : r0 = phi (init, r0)
7762 : r1 = phi (init, r1)
7763 : r0 = x0 + r0;
7764 : r1 = x1 + r1;
7765 : (i.e. we generate VF results in 2 registers).
7766 : In this case we have a separate def-use cycle for each copy, and therefore
7767 : for each copy we get the vector def for the reduction variable from the
7768 : respective phi node created for this copy.
7769 :
7770 : Otherwise (the reduction is unused in the loop nest), we can combine
7771 : together intermediate results, like so (e.g. for ncopies=2):
7772 : r = phi (init, r)
7773 : r = x0 + r;
7774 : r = x1 + r;
7775 : (i.e. we generate VF/2 results in a single register).
7776 : In this case for each copy we get the vector def for the reduction variable
7777 : from the vectorized reduction operation generated in the previous iteration.
7778 :
7779 : This only works when we see both the reduction PHI and its only consumer
7780 : in vectorizable_reduction and there are no intermediate stmts
7781 : participating. When unrolling we want each unrolled iteration to have its
7782 : own reduction accumulator since one of the main goals of unrolling a
7783 : reduction is to reduce the aggregate loop-carried latency. */
7784 46947 : if (ncopies > 1
7785 46947 : && !reduc_chain
7786 5248 : && SLP_TREE_LANES (slp_node) == 1
7787 5088 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7788 5069 : && reduc_chain_length == 1
7789 4766 : && loop_vinfo->suggested_unroll_factor == 1)
7790 46947 : single_defuse_cycle = true;
7791 :
7792 46947 : if (single_defuse_cycle && !lane_reducing)
7793 : {
7794 4199 : gcc_assert (op.code != COND_EXPR);
7795 :
7796 : /* 4. check support for the operation in the loop
7797 :
7798 : This isn't necessary for the lane reduction codes, since they
7799 : can only be produced by pattern matching, and it's up to the
7800 : pattern matcher to test for support. The main reason for
7801 : specifically skipping this step is to avoid rechecking whether
7802 : mixed-sign dot-products can be implemented using signed
7803 : dot-products. */
7804 4199 : machine_mode vec_mode = TYPE_MODE (vectype_in);
7805 4199 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
7806 : {
7807 711 : if (dump_enabled_p ())
7808 24 : dump_printf (MSG_NOTE, "op not supported by target.\n");
7809 1422 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7810 711 : || !vect_can_vectorize_without_simd_p (op.code))
7811 : single_defuse_cycle = false;
7812 : else
7813 5 : if (dump_enabled_p ())
7814 0 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7815 : }
7816 :
7817 4199 : if (vect_emulated_vector_p (vectype_in)
7818 4199 : && !vect_can_vectorize_without_simd_p (op.code))
7819 : {
7820 0 : if (dump_enabled_p ())
7821 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
7822 0 : return false;
7823 : }
7824 : }
7825 46947 : if (dump_enabled_p () && single_defuse_cycle)
7826 650 : dump_printf_loc (MSG_NOTE, vect_location,
7827 : "using single def-use cycle for reduction by reducing "
7828 : "multiple vectors to one in the loop body\n");
7829 46947 : VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7830 :
7831 : /* For lane-reducing operation, the below processing related to single
7832 : defuse-cycle will be done in its own vectorizable function. One more
7833 : thing to note is that the operation must not be involved in fold-left
7834 : reduction. */
7835 46947 : single_defuse_cycle &= !lane_reducing;
7836 :
7837 46947 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
7838 23964 : for (i = 0; i < (int) op.num_ops; i++)
7839 16604 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7840 : {
7841 0 : if (dump_enabled_p ())
7842 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7843 : "incompatible vector types for invariants\n");
7844 0 : return false;
7845 : }
7846 :
7847 46947 : vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
7848 : reduction_type, ncopies, cost_vec);
7849 : /* Cost the reduction op inside the loop if transformed via
7850 : vect_transform_reduction for non-lane-reducing operation. Otherwise
7851 : this is costed by the separate vectorizable_* routines. */
7852 46947 : if (single_defuse_cycle)
7853 3493 : record_stmt_cost (cost_vec, ncopies, vector_stmt,
7854 : slp_for_stmt_info, 0, vect_body);
7855 :
7856 46947 : if (dump_enabled_p ()
7857 46947 : && reduction_type == FOLD_LEFT_REDUCTION)
7858 219 : dump_printf_loc (MSG_NOTE, vect_location,
7859 : "using an in-order (fold-left) reduction.\n");
7860 46947 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7861 :
7862 : /* All but single defuse-cycle optimized and fold-left reductions go
7863 : through their own vectorizable_* routines. */
7864 46947 : stmt_vec_info tem
7865 46947 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (slp_node_instance));
7866 46947 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
7867 39587 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7868 : else
7869 : {
7870 7360 : STMT_VINFO_DEF_TYPE (tem) = vect_reduction_def;
7871 7360 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7872 3166 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7873 : slp_node, op.code, op.type,
7874 : vectype_in);
7875 : }
7876 : return true;
7877 : }
7878 :
7879 : /* STMT_INFO is a dot-product reduction whose multiplication operands
7880 : have different signs. Emit a sequence to emulate the operation
7881 : using a series of signed DOT_PROD_EXPRs and return the last
7882 : statement generated. VEC_DEST is the result of the vector operation
7883 : and VOP lists its inputs. */
7884 :
7885 : static gassign *
7886 4 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7887 : gimple_stmt_iterator *gsi, tree vec_dest,
7888 : tree vop[3])
7889 : {
7890 4 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7891 4 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7892 4 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
7893 4 : gimple *new_stmt;
7894 :
7895 : /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
7896 4 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7897 0 : std::swap (vop[0], vop[1]);
7898 :
7899 : /* Convert all inputs to signed types. */
7900 16 : for (int i = 0; i < 3; ++i)
7901 12 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7902 : {
7903 4 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7904 4 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7905 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7906 4 : vop[i] = tmp;
7907 : }
7908 :
7909 : /* In the comments below we assume 8-bit inputs for simplicity,
7910 : but the approach works for any full integer type. */
7911 :
7912 : /* Create a vector of -128. */
7913 4 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7914 4 : tree min_narrow = build_vector_from_val (narrow_vectype,
7915 : min_narrow_elttype);
7916 :
7917 : /* Create a vector of 64. */
7918 4 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7919 4 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7920 4 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7921 :
7922 : /* Emit: SUB_RES = VOP[0] - 128. */
7923 4 : tree sub_res = make_ssa_name (narrow_vectype);
7924 4 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7925 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7926 :
7927 : /* Emit:
7928 :
7929 : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7930 : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7931 : STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7932 :
7933 : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7934 : Doing the two 64 * y steps first allows more time to compute x. */
7935 4 : tree stage1 = make_ssa_name (wide_vectype);
7936 4 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7937 : vop[1], half_narrow, vop[2]);
7938 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7939 :
7940 4 : tree stage2 = make_ssa_name (wide_vectype);
7941 4 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7942 : vop[1], half_narrow, stage1);
7943 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7944 :
7945 4 : tree stage3 = make_ssa_name (wide_vectype);
7946 4 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7947 : sub_res, vop[1], stage2);
7948 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7949 :
7950 : /* Convert STAGE3 to the reduction type. */
7951 4 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7952 4 : }
7953 :
7954 : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7955 : value. */
7956 :
7957 : bool
7958 2573 : vect_transform_reduction (loop_vec_info loop_vinfo,
7959 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7960 : slp_tree slp_node)
7961 : {
7962 2573 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
7963 2573 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7964 2573 : unsigned vec_num;
7965 :
7966 2573 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
7967 :
7968 2573 : if (nested_in_vect_loop_p (loop, stmt_info))
7969 : {
7970 0 : loop = loop->inner;
7971 0 : gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
7972 : == vect_double_reduction_def);
7973 : }
7974 :
7975 2573 : gimple_match_op op;
7976 2573 : if (!gimple_extract_op (stmt_info->stmt, &op))
7977 0 : gcc_unreachable ();
7978 :
7979 : /* All uses but the last are expected to be defined in the loop.
7980 : The last use is the reduction variable. In case of nested cycle this
7981 : assumption is not true: we use reduc_index to record the index of the
7982 : reduction variable. */
7983 2573 : int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
7984 2573 : tree vectype_in = SLP_TREE_VECTYPE (slp_node);
7985 2573 : if (lane_reducing_op_p (op.code))
7986 252 : vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);
7987 :
7988 2573 : vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
7989 :
7990 2573 : code_helper code = canonicalize_code (op.code, op.type);
7991 2573 : internal_fn cond_fn
7992 476 : = ((code.is_internal_fn ()
7993 476 : && internal_fn_mask_index ((internal_fn)code) != -1)
7994 2573 : ? (internal_fn)code : get_conditional_internal_fn (code, op.type));
7995 :
7996 2573 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7997 2573 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7998 2573 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7999 :
8000 : /* Transform. */
8001 2573 : tree new_temp = NULL_TREE;
8002 18011 : auto_vec<tree> vec_oprnds[3];
8003 :
8004 2573 : if (dump_enabled_p ())
8005 745 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8006 :
8007 : /* A binary COND_OP reduction must have the same definition and else
8008 : value. */
8009 3049 : bool cond_fn_p = code.is_internal_fn ()
8010 476 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8011 476 : if (cond_fn_p)
8012 : {
8013 476 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8014 : || code == IFN_COND_MUL || code == IFN_COND_AND
8015 : || code == IFN_COND_IOR || code == IFN_COND_XOR
8016 : || code == IFN_COND_MIN || code == IFN_COND_MAX);
8017 476 : gcc_assert (op.num_ops == 4
8018 : && (op.ops[reduc_index]
8019 : == op.ops[internal_fn_else_index ((internal_fn) code)]));
8020 : }
8021 :
8022 2573 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8023 :
8024 2573 : vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
8025 2573 : if (reduction_type == FOLD_LEFT_REDUCTION)
8026 : {
8027 843 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
8028 843 : gcc_assert (code.is_tree_code () || cond_fn_p);
8029 843 : return vectorize_fold_left_reduction
8030 843 : (loop_vinfo, stmt_info, gsi, slp_node,
8031 843 : code, reduc_fn, op.num_ops, vectype_in,
8032 843 : reduc_index, masks, lens);
8033 : }
8034 :
8035 1730 : bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
8036 1730 : bool lane_reducing = lane_reducing_op_p (code);
8037 1478 : gcc_assert (single_defuse_cycle || lane_reducing);
8038 :
8039 1730 : if (lane_reducing)
8040 : {
8041 : /* The last operand of lane-reducing op is for reduction. */
8042 252 : gcc_assert (reduc_index == (int) op.num_ops - 1);
8043 : }
8044 :
8045 : /* Create the destination vector */
8046 1730 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8047 1730 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8048 :
8049 : /* Get NCOPIES vector definitions for all operands except the reduction
8050 : definition. */
8051 1730 : if (!cond_fn_p)
8052 : {
8053 1277 : gcc_assert (reduc_index >= 0 && reduc_index <= 2);
8054 2107 : vect_get_vec_defs (loop_vinfo, slp_node,
8055 1277 : single_defuse_cycle && reduc_index == 0
8056 : ? NULL_TREE : op.ops[0], &vec_oprnds[0],
8057 1277 : single_defuse_cycle && reduc_index == 1
8058 : ? NULL_TREE : op.ops[1], &vec_oprnds[1],
8059 1277 : op.num_ops == 3
8060 252 : && !(single_defuse_cycle && reduc_index == 2)
8061 : ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
8062 : }
8063 : else
8064 : {
8065 : /* For a conditional operation pass the truth type as mask
8066 : vectype. */
8067 453 : gcc_assert (single_defuse_cycle
8068 : && (reduc_index == 1 || reduc_index == 2));
8069 453 : vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
8070 : &vec_oprnds[0],
8071 : reduc_index == 1 ? NULL_TREE : op.ops[1],
8072 : &vec_oprnds[1],
8073 : reduc_index == 2 ? NULL_TREE : op.ops[2],
8074 : &vec_oprnds[2]);
8075 : }
8076 :
8077 : /* For single def-use cycles get one copy of the vectorized reduction
8078 : definition. */
8079 1730 : if (single_defuse_cycle)
8080 : {
8081 1645 : vect_get_vec_defs (loop_vinfo, slp_node,
8082 : reduc_index == 0 ? op.ops[0] : NULL_TREE,
8083 : &vec_oprnds[0],
8084 : reduc_index == 1 ? op.ops[1] : NULL_TREE,
8085 : &vec_oprnds[1],
8086 : reduc_index == 2 ? op.ops[2] : NULL_TREE,
8087 : &vec_oprnds[2]);
8088 : }
8089 85 : else if (lane_reducing)
8090 : {
8091 : /* For normal reduction, consistency between vectorized def/use is
8092 : naturally ensured when mapping from scalar statement. But if lane-
8093 : reducing op is involved in reduction, thing would become somewhat
8094 : complicated in that the op's result and operand for accumulation are
8095 : limited to less lanes than other operands, which certainly causes
8096 : def/use mismatch on adjacent statements around the op if do not have
8097 : any kind of specific adjustment. One approach is to refit lane-
8098 : reducing op in the way of introducing new trivial pass-through copies
8099 : to fix possible def/use gap, so as to make it behave like a normal op.
8100 : And vector reduction PHIs are always generated to the full extent, no
8101 : matter lane-reducing op exists or not. If some copies or PHIs are
8102 : actually superfluous, they would be cleaned up by passes after
8103 : vectorization. An example for single-lane slp, lane-reducing ops
8104 : with mixed input vectypes in a reduction chain, is given as below.
8105 : Similarly, this handling is applicable for multiple-lane slp as well.
8106 :
8107 : int sum = 1;
8108 : for (i)
8109 : {
8110 : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
8111 : sum += w[i]; // widen-sum <vector(16) char>
8112 : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
8113 : sum += n[i]; // normal <vector(4) int>
8114 : }
8115 :
8116 : The vector size is 128-bit,vectorization factor is 16. Reduction
8117 : statements would be transformed as:
8118 :
8119 : vector<4> int sum_v0 = { 0, 0, 0, 1 };
8120 : vector<4> int sum_v1 = { 0, 0, 0, 0 };
8121 : vector<4> int sum_v2 = { 0, 0, 0, 0 };
8122 : vector<4> int sum_v3 = { 0, 0, 0, 0 };
8123 :
8124 : for (i / 16)
8125 : {
8126 : sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8127 : sum_v1 = sum_v1; // copy
8128 : sum_v2 = sum_v2; // copy
8129 : sum_v3 = sum_v3; // copy
8130 :
8131 : sum_v0 = sum_v0; // copy
8132 : sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8133 : sum_v2 = sum_v2; // copy
8134 : sum_v3 = sum_v3; // copy
8135 :
8136 : sum_v0 = sum_v0; // copy
8137 : sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8138 : sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8139 : sum_v3 = sum_v3; // copy
8140 :
8141 : sum_v0 += n_v0[i: 0 ~ 3 ];
8142 : sum_v1 += n_v1[i: 4 ~ 7 ];
8143 : sum_v2 += n_v2[i: 8 ~ 11];
8144 : sum_v3 += n_v3[i: 12 ~ 15];
8145 : }
8146 :
8147 : Moreover, for a higher instruction parallelism in final vectorized
8148 : loop, it is considered to make those effective vector lane-reducing
8149 : ops be distributed evenly among all def-use cycles. In the above
8150 : example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
8151 : cycles, instruction dependency among them could be eliminated. */
8152 85 : unsigned effec_ncopies = vec_oprnds[0].length ();
8153 85 : unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8154 :
8155 85 : gcc_assert (effec_ncopies <= total_ncopies);
8156 :
8157 85 : if (effec_ncopies < total_ncopies)
8158 : {
8159 255 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8160 : {
8161 340 : gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8162 170 : vec_oprnds[i].safe_grow_cleared (total_ncopies);
8163 : }
8164 : }
8165 :
8166 85 : tree reduc_vectype_in = vectype_in;
8167 85 : gcc_assert (reduc_vectype_in);
8168 :
8169 85 : unsigned effec_reduc_ncopies
8170 85 : = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
8171 :
8172 85 : gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8173 :
8174 85 : if (effec_ncopies < effec_reduc_ncopies)
8175 : {
8176 : /* Find suitable def-use cycles to generate vectorized statements
8177 : into, and reorder operands based on the selection. */
8178 0 : unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
8179 0 : unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8180 :
8181 0 : gcc_assert (curr_pos < effec_reduc_ncopies);
8182 0 : VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;
8183 :
8184 0 : if (curr_pos)
8185 : {
8186 0 : unsigned count = effec_reduc_ncopies - effec_ncopies;
8187 0 : unsigned start = curr_pos - count;
8188 :
8189 0 : if ((int) start < 0)
8190 : {
8191 0 : count = curr_pos;
8192 0 : start = 0;
8193 : }
8194 :
8195 0 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8196 : {
8197 0 : for (unsigned j = effec_ncopies; j > start; j--)
8198 : {
8199 0 : unsigned k = j - 1;
8200 0 : std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8201 0 : gcc_assert (!vec_oprnds[i][k]);
8202 : }
8203 : }
8204 : }
8205 : }
8206 : }
8207 :
8208 1730 : bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
8209 2967 : unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8210 1730 : unsigned mask_index = 0;
8211 :
8212 7583 : for (unsigned i = 0; i < num; ++i)
8213 : {
8214 5853 : gimple *new_stmt;
8215 5853 : tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8216 5853 : if (!vop[0] || !vop[1])
8217 : {
8218 456 : tree reduc_vop = vec_oprnds[reduc_index][i];
8219 :
8220 : /* If could not generate an effective vector statement for current
8221 : portion of reduction operand, insert a trivial copy to simply
8222 : handle over the operand to other dependent statements. */
8223 456 : gcc_assert (reduc_vop);
8224 :
8225 456 : if (TREE_CODE (reduc_vop) == SSA_NAME
8226 456 : && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8227 456 : new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8228 : else
8229 : {
8230 0 : new_temp = make_ssa_name (vec_dest);
8231 0 : new_stmt = gimple_build_assign (new_temp, reduc_vop);
8232 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8233 : gsi);
8234 : }
8235 : }
8236 5397 : else if (masked_loop_p && !mask_by_cond_expr)
8237 : {
8238 : /* No conditional ifns have been defined for lane-reducing op
8239 : yet. */
8240 16 : gcc_assert (!lane_reducing);
8241 :
8242 16 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8243 : vec_num, vectype_in,
8244 : mask_index++);
8245 16 : gcall *call;
8246 24 : if (code.is_internal_fn () && cond_fn_p)
8247 : {
8248 16 : gcc_assert (op.num_ops >= 3
8249 : && internal_fn_mask_index (internal_fn (code)) == 0);
8250 8 : vop[2] = vec_oprnds[2][i];
8251 8 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
8252 : mask, vop[0], gsi);
8253 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
8254 : vop[2], vop[reduc_index]);
8255 : }
8256 : else
8257 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
8258 : vop[1], vop[reduc_index]);
8259 16 : new_temp = make_ssa_name (vec_dest, call);
8260 16 : gimple_call_set_lhs (call, new_temp);
8261 16 : gimple_call_set_nothrow (call, true);
8262 16 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8263 16 : new_stmt = call;
8264 : }
8265 : else
8266 : {
8267 5381 : if (op.num_ops >= 3)
8268 1747 : vop[2] = vec_oprnds[2][i];
8269 :
8270 5381 : if (masked_loop_p && mask_by_cond_expr)
8271 : {
8272 4 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8273 : vec_num, vectype_in,
8274 : mask_index++);
8275 4 : build_vect_cond_expr (code, vop, mask, gsi);
8276 : }
8277 :
8278 5381 : if (emulated_mixed_dot_prod)
8279 4 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8280 : vec_dest, vop);
8281 :
8282 6719 : else if (code.is_internal_fn () && !cond_fn_p)
8283 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8284 : op.num_ops,
8285 : vop[0], vop[1], vop[2]);
8286 6719 : else if (code.is_internal_fn () && cond_fn_p)
8287 1342 : new_stmt = gimple_build_call_internal (internal_fn (code),
8288 : op.num_ops,
8289 : vop[0], vop[1], vop[2],
8290 : vop[reduc_index]);
8291 : else
8292 4035 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8293 : vop[0], vop[1], vop[2]);
8294 5381 : new_temp = make_ssa_name (vec_dest, new_stmt);
8295 5381 : gimple_set_lhs (new_stmt, new_temp);
8296 5381 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8297 : }
8298 :
8299 5853 : if (single_defuse_cycle && i < num - 1)
8300 3524 : vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8301 : else
8302 2329 : slp_node->push_vec_def (new_stmt);
8303 : }
8304 :
8305 : return true;
8306 10292 : }
8307 :
8308 : /* Transform phase of a cycle PHI. */
8309 :
8310 : bool
8311 23709 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8312 : stmt_vec_info stmt_info,
8313 : slp_tree slp_node, slp_instance slp_node_instance)
8314 : {
8315 23709 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
8316 23709 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8317 23709 : int i;
8318 23709 : bool nested_cycle = false;
8319 23709 : int vec_num;
8320 :
8321 23843 : if (nested_in_vect_loop_p (loop, stmt_info))
8322 : {
8323 : loop = loop->inner;
8324 : nested_cycle = true;
8325 : }
8326 :
8327 23709 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
8328 23709 : if (reduc_info
8329 23067 : && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8330 23067 : || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
8331 : /* Leave the scalar phi in place. */
8332 : return true;
8333 :
8334 22224 : if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ())
8335 118 : dump_printf_loc (MSG_NOTE, vect_location,
8336 : "vectorizing a reduction chain\n");
8337 :
8338 22866 : vec_num = vect_get_num_copies (loop_vinfo, slp_node);
8339 :
8340 : /* Check whether we should use a single PHI node and accumulate
8341 : vectors to one before the backedge. */
8342 22866 : if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
8343 22866 : vec_num = 1;
8344 :
8345 : /* Create the destination vector */
8346 22866 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8347 22866 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8348 : vectype_out);
8349 :
8350 : /* Get the loop-entry arguments. */
8351 22866 : auto_vec<tree> vec_initial_defs;
8352 22866 : vec_initial_defs.reserve (vec_num);
8353 : /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8354 : and we can't use zero for induc_val, use initial_def. Similarly
8355 : for REDUC_MIN and initial_def larger than the base. */
8356 22866 : if (reduc_info
8357 22224 : && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8358 : {
8359 62 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8360 62 : tree initial_def = vect_phi_initial_value (phi);
8361 62 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
8362 62 : tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
8363 62 : if (TREE_CODE (initial_def) == INTEGER_CST
8364 60 : && !integer_zerop (induc_val)
8365 122 : && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
8366 42 : && tree_int_cst_lt (initial_def, induc_val))
8367 58 : || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
8368 18 : && tree_int_cst_lt (induc_val, initial_def))))
8369 : {
8370 2 : induc_val = initial_def;
8371 : /* Communicate we used the initial_def to epilouge
8372 : generation. */
8373 2 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8374 : }
8375 62 : vec_initial_defs.quick_push
8376 62 : (build_vector_from_val (vectype_out, induc_val));
8377 62 : }
8378 22804 : else if (nested_cycle)
8379 : {
8380 726 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8381 726 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8382 : &vec_initial_defs);
8383 : }
8384 : else
8385 : {
8386 22078 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8387 22078 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
8388 22078 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8389 :
8390 22078 : unsigned int num_phis = stmts.length ();
8391 22078 : if (reduc_info->is_reduc_chain)
8392 188 : num_phis = 1;
8393 22078 : initial_values.reserve (num_phis);
8394 44612 : for (unsigned int i = 0; i < num_phis; ++i)
8395 : {
8396 22534 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8397 22534 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8398 : }
8399 22078 : tree neutral_op = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
8400 22078 : if (vec_num == 1
8401 22078 : && vect_find_reusable_accumulator (loop_vinfo,
8402 : reduc_info, vectype_out))
8403 : ;
8404 : /* Try to simplify the vector initialization by applying an
8405 : adjustment after the reduction has been performed. This
8406 : can also break a critical path but on the other hand
8407 : requires to keep the initial value live across the loop. */
8408 17894 : else if (neutral_op
8409 17332 : && initial_values.length () == 1
8410 17147 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8411 34966 : && !operand_equal_p (neutral_op, initial_values[0]))
8412 : {
8413 12225 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
8414 12225 : = initial_values[0];
8415 12225 : initial_values[0] = neutral_op;
8416 : }
8417 22078 : if (!VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
8418 4184 : || loop_vinfo->main_loop_edge)
8419 43710 : get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
8420 : &vec_initial_defs, vec_num,
8421 : stmts.length (), neutral_op);
8422 : }
8423 :
8424 22866 : if (reduc_info)
8425 22224 : if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
8426 : {
8427 4184 : tree def = accumulator->reduc_input;
8428 4184 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8429 : {
8430 4181 : unsigned int nreduc;
8431 8362 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8432 4181 : (TREE_TYPE (def)),
8433 4181 : TYPE_VECTOR_SUBPARTS (vectype_out),
8434 : &nreduc);
8435 0 : gcc_assert (res);
8436 4181 : gimple_seq stmts = NULL;
8437 : /* Reduce the single vector to a smaller one. */
8438 4181 : if (nreduc != 1)
8439 : {
8440 : /* Perform the reduction in the appropriate type. */
8441 4181 : tree rvectype = vectype_out;
8442 4181 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8443 4181 : TREE_TYPE (TREE_TYPE (def))))
8444 235 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8445 : TYPE_VECTOR_SUBPARTS
8446 470 : (vectype_out));
8447 4181 : def = vect_create_partial_epilog (def, rvectype,
8448 : VECT_REDUC_INFO_CODE
8449 : (reduc_info),
8450 : &stmts);
8451 : }
8452 : /* The epilogue loop might use a different vector mode, like
8453 : VNx2DI vs. V2DI. */
8454 4181 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8455 : {
8456 0 : tree reduc_type = build_vector_type_for_mode
8457 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8458 0 : def = gimple_convert (&stmts, reduc_type, def);
8459 : }
8460 : /* Adjust the input so we pick up the partially reduced value
8461 : for the skip edge in vect_create_epilog_for_reduction. */
8462 4181 : accumulator->reduc_input = def;
8463 : /* And the reduction could be carried out using a different sign. */
8464 4181 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8465 235 : def = gimple_convert (&stmts, vectype_out, def);
8466 4181 : edge e;
8467 4181 : if ((e = loop_vinfo->main_loop_edge)
8468 4181 : || (e = loop_vinfo->skip_this_loop_edge))
8469 : {
8470 : /* While we'd like to insert on the edge this will split
8471 : blocks and disturb bookkeeping, we also will eventually
8472 : need this on the skip edge. Rely on sinking to
8473 : fixup optimal placement and insert in the pred. */
8474 3958 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
8475 : /* Insert before a cond that eventually skips the
8476 : epilogue. */
8477 3958 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8478 3941 : gsi_prev (&gsi);
8479 3958 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8480 : }
8481 : else
8482 223 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8483 : stmts);
8484 : }
8485 4184 : if (loop_vinfo->main_loop_edge)
8486 3961 : vec_initial_defs[0]
8487 3961 : = vect_get_main_loop_result (loop_vinfo, def,
8488 3961 : vec_initial_defs[0]);
8489 : else
8490 223 : vec_initial_defs.safe_push (def);
8491 : }
8492 :
8493 : /* Generate the reduction PHIs upfront. */
8494 47512 : for (i = 0; i < vec_num; i++)
8495 : {
8496 24646 : tree vec_init_def = vec_initial_defs[i];
8497 : /* Create the reduction-phi that defines the reduction
8498 : operand. */
8499 24646 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
8500 24646 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8501 : UNKNOWN_LOCATION);
8502 :
8503 : /* The loop-latch arg is set in epilogue processing. */
8504 :
8505 24646 : slp_node->push_vec_def (new_phi);
8506 : }
8507 :
8508 22866 : return true;
8509 22866 : }
8510 :
8511 : /* Vectorizes LC PHIs. */
8512 :
8513 : bool
8514 169054 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8515 : stmt_vec_info stmt_info,
8516 : slp_tree slp_node)
8517 : {
8518 169054 : if (!loop_vinfo
8519 169054 : || !is_a <gphi *> (stmt_info->stmt)
8520 203469 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8521 : return false;
8522 :
8523 761 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8524 0 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8525 : return false;
8526 :
8527 : /* Deal with copies from externs or constants that disguise as
8528 : loop-closed PHI nodes (PR97886). */
8529 761 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8530 : SLP_TREE_VECTYPE (slp_node)))
8531 : {
8532 0 : if (dump_enabled_p ())
8533 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8534 : "incompatible vector types for invariants\n");
8535 0 : return false;
8536 : }
8537 :
8538 : /* ??? This can happen with data vs. mask uses of boolean. */
8539 761 : if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
8540 761 : SLP_TREE_VECTYPE
8541 : (SLP_TREE_CHILDREN (slp_node)[0])))
8542 : {
8543 0 : if (dump_enabled_p ())
8544 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8545 : "missed mask promotion\n");
8546 0 : return false;
8547 : }
8548 :
8549 761 : SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
8550 761 : return true;
8551 : }
8552 :
8553 : bool
8554 504 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
8555 : stmt_vec_info stmt_info,
8556 : slp_tree slp_node)
8557 : {
8558 :
8559 504 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8560 504 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8561 504 : basic_block bb = gimple_bb (stmt_info->stmt);
8562 504 : edge e = single_pred_edge (bb);
8563 504 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8564 504 : auto_vec<tree> vec_oprnds;
8565 1008 : vect_get_vec_defs (loop_vinfo, slp_node,
8566 504 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8567 1118 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8568 : {
8569 : /* Create the vectorized LC PHI node. */
8570 614 : gphi *new_phi = create_phi_node (vec_dest, bb);
8571 614 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8572 614 : slp_node->push_vec_def (new_phi);
8573 : }
8574 :
8575 504 : return true;
8576 504 : }
8577 :
8578 : /* Vectorizes PHIs. */
8579 :
8580 : bool
8581 138446 : vectorizable_phi (bb_vec_info vinfo,
8582 : stmt_vec_info stmt_info,
8583 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8584 : {
8585 138446 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8586 : return false;
8587 :
8588 71134 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8589 : return false;
8590 :
8591 71134 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8592 :
8593 71134 : if (cost_vec) /* transformation not required. */
8594 : {
8595 : slp_tree child;
8596 : unsigned i;
8597 194369 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8598 137362 : if (!child)
8599 : {
8600 0 : if (dump_enabled_p ())
8601 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8602 : "PHI node with unvectorized backedge def\n");
8603 0 : return false;
8604 : }
8605 137362 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8606 : {
8607 18 : if (dump_enabled_p ())
8608 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8609 : "incompatible vector types for invariants\n");
8610 18 : return false;
8611 : }
8612 137344 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8613 137344 : && !useless_type_conversion_p (vectype,
8614 : SLP_TREE_VECTYPE (child)))
8615 : {
8616 : /* With bools we can have mask and non-mask precision vectors
8617 : or different non-mask precisions. while pattern recog is
8618 : supposed to guarantee consistency here bugs in it can cause
8619 : mismatches (PR103489 and PR103800 for example).
8620 : Deal with them here instead of ICEing later. */
8621 18 : if (dump_enabled_p ())
8622 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8623 : "incompatible vector type setup from "
8624 : "bool pattern detection\n");
8625 18 : return false;
8626 : }
8627 :
8628 : /* For single-argument PHIs assume coalescing which means zero cost
8629 : for the scalar and the vector PHIs. This avoids artificially
8630 : favoring the vector path (but may pessimize it in some cases). */
8631 57007 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8632 51720 : record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
8633 : vector_stmt, slp_node, vectype, 0, vect_body);
8634 57007 : SLP_TREE_TYPE (slp_node) = phi_info_type;
8635 57007 : return true;
8636 : }
8637 :
8638 14091 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8639 14091 : basic_block bb = gimple_bb (stmt_info->stmt);
8640 14091 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8641 14091 : auto_vec<gphi *> new_phis;
8642 51045 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8643 : {
8644 36954 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8645 :
8646 : /* Skip not yet vectorized defs. */
8647 37401 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8648 36954 : && SLP_TREE_VEC_DEFS (child).is_empty ())
8649 447 : continue;
8650 :
8651 36507 : auto_vec<tree> vec_oprnds;
8652 36507 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8653 36507 : if (!new_phis.exists ())
8654 : {
8655 14091 : new_phis.create (vec_oprnds.length ());
8656 29801 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8657 : {
8658 : /* Create the vectorized LC PHI node. */
8659 15710 : new_phis.quick_push (create_phi_node (vec_dest, bb));
8660 15710 : slp_node->push_vec_def (new_phis[j]);
8661 : }
8662 : }
8663 36507 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8664 79758 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8665 43251 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8666 36507 : }
8667 : /* We should have at least one already vectorized child. */
8668 14091 : gcc_assert (new_phis.exists ());
8669 :
8670 14091 : return true;
8671 14091 : }
8672 :
8673 : /* Vectorizes first order recurrences. An overview of the transformation
8674 : is described below. Suppose we have the following loop.
8675 :
8676 : int t = 0;
8677 : for (int i = 0; i < n; ++i)
8678 : {
8679 : b[i] = a[i] - t;
8680 : t = a[i];
8681 : }
8682 :
8683 : There is a first-order recurrence on 'a'. For this loop, the scalar IR
8684 : looks (simplified) like:
8685 :
8686 : scalar.preheader:
8687 : init = 0;
8688 :
8689 : scalar.body:
8690 : i = PHI <0(scalar.preheader), i+1(scalar.body)>
8691 : _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8692 : _1 = a[i]
8693 : b[i] = _1 - _2
8694 : if (i < n) goto scalar.body
8695 :
8696 : In this example, _2 is a recurrence because it's value depends on the
8697 : previous iteration. We vectorize this as (VF = 4)
8698 :
8699 : vector.preheader:
8700 : vect_init = vect_cst(..., ..., ..., 0)
8701 :
8702 : vector.body
8703 : i = PHI <0(vector.preheader), i+4(vector.body)>
8704 : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8705 : vect_2 = a[i, i+1, i+2, i+3];
8706 : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8707 : b[i, i+1, i+2, i+3] = vect_2 - vect_3
8708 : if (..) goto vector.body
8709 :
8710 : In this function, vectorizable_recurr, we code generate both the
8711 : vector PHI node and the permute since those together compute the
8712 : vectorized value of the scalar PHI. We do not yet have the
8713 : backedge value to fill in there nor into the vec_perm. Those
8714 : are filled in vect_schedule_scc.
8715 :
8716 : TODO: Since the scalar loop does not have a use of the recurrence
8717 : outside of the loop the natural way to implement peeling via
8718 : vectorizing the live value doesn't work. For now peeling of loops
8719 : with a recurrence is not implemented. For SLP the supported cases
8720 : are restricted to those requiring a single vector recurrence PHI. */
8721 :
8722 : bool
8723 168333 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8724 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8725 : {
8726 168333 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8727 : return false;
8728 :
8729 33694 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
8730 :
8731 : /* So far we only support first-order recurrence auto-vectorization. */
8732 33694 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8733 : return false;
8734 :
8735 400 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8736 400 : unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
8737 400 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8738 400 : unsigned dist = SLP_TREE_LANES (slp_node);
8739 : /* We need to be able to make progress with a single vector. */
8740 400 : if (maybe_gt (dist * 2, nunits))
8741 : {
8742 0 : if (dump_enabled_p ())
8743 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8744 : "first order recurrence exceeds half of "
8745 : "a vector\n");
8746 0 : return false;
8747 : }
8748 :
8749 : /* We need to be able to build a { ..., a, b } init vector with
8750 : dist number of distinct trailing values. Always possible
8751 : when dist == 1 or when nunits is constant or when the initializations
8752 : are uniform. */
8753 400 : tree uniform_initval = NULL_TREE;
8754 400 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8755 1624 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8756 : {
8757 436 : gphi *phi = as_a <gphi *> (s->stmt);
8758 436 : if (! uniform_initval)
8759 400 : uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8760 36 : else if (! operand_equal_p (uniform_initval,
8761 36 : PHI_ARG_DEF_FROM_EDGE (phi, pe)))
8762 : {
8763 : uniform_initval = NULL_TREE;
8764 : break;
8765 : }
8766 : }
8767 400 : if (!uniform_initval && !nunits.is_constant ())
8768 : {
8769 : if (dump_enabled_p ())
8770 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8771 : "cannot build initialization vector for "
8772 : "first order recurrence\n");
8773 : return false;
8774 : }
8775 :
8776 : /* First-order recurrence autovectorization needs to handle permutation
8777 : with indices = [nunits-1, nunits, nunits+1, ...]. */
8778 400 : vec_perm_builder sel (nunits, 1, 3);
8779 1600 : for (int i = 0; i < 3; ++i)
8780 1200 : sel.quick_push (nunits - dist + i);
8781 400 : vec_perm_indices indices (sel, 2, nunits);
8782 :
8783 400 : if (cost_vec) /* transformation not required. */
8784 : {
8785 360 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8786 : indices))
8787 : return false;
8788 :
8789 : /* We eventually need to set a vector type on invariant
8790 : arguments. */
8791 : unsigned j;
8792 : slp_tree child;
8793 744 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8794 496 : if (!vect_maybe_update_slp_op_vectype (child, vectype))
8795 : {
8796 0 : if (dump_enabled_p ())
8797 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8798 : "incompatible vector types for "
8799 : "invariants\n");
8800 0 : return false;
8801 : }
8802 :
8803 : /* Verify we have set up compatible types. */
8804 248 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8805 248 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
8806 248 : tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
8807 248 : if (!types_compatible_p (latch_vectype, vectype))
8808 : return false;
8809 :
8810 : /* The recurrence costs the initialization vector and one permute
8811 : for each copy. With SLP the prologue value is explicitly
8812 : represented and costed separately. */
8813 248 : unsigned prologue_cost = 0;
8814 248 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8815 : slp_node, 0, vect_body);
8816 248 : if (dump_enabled_p ())
8817 48 : dump_printf_loc (MSG_NOTE, vect_location,
8818 : "vectorizable_recurr: inside_cost = %d, "
8819 : "prologue_cost = %d .\n", inside_cost,
8820 : prologue_cost);
8821 :
8822 248 : SLP_TREE_TYPE (slp_node) = recurr_info_type;
8823 248 : return true;
8824 : }
8825 :
8826 40 : tree vec_init;
8827 40 : if (! uniform_initval)
8828 : {
8829 6 : vec<constructor_elt, va_gc> *v = NULL;
8830 6 : vec_alloc (v, nunits.to_constant ());
8831 33 : for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
8832 27 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8833 : build_zero_cst (TREE_TYPE (vectype)));
8834 39 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8835 : {
8836 21 : gphi *phi = as_a <gphi *> (s->stmt);
8837 21 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8838 21 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
8839 21 : TREE_TYPE (preheader)))
8840 : {
8841 0 : gimple_seq stmts = NULL;
8842 0 : preheader = gimple_convert (&stmts,
8843 0 : TREE_TYPE (vectype), preheader);
8844 0 : gsi_insert_seq_on_edge_immediate (pe, stmts);
8845 : }
8846 21 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
8847 : }
8848 6 : vec_init = build_constructor (vectype, v);
8849 : }
8850 : else
8851 : vec_init = uniform_initval;
8852 40 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8853 :
8854 : /* Create the vectorized first-order PHI node. */
8855 40 : tree vec_dest = vect_get_new_vect_var (vectype,
8856 : vect_simple_var, "vec_recur_");
8857 40 : basic_block bb = gimple_bb (phi);
8858 40 : gphi *new_phi = create_phi_node (vec_dest, bb);
8859 40 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8860 :
8861 : /* Insert shuffles the first-order recurrence autovectorization.
8862 : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8863 40 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
8864 :
8865 : /* Insert the required permute after the latch definition. The
8866 : second and later operands are tentative and will be updated when we have
8867 : vectorized the latch definition. */
8868 40 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8869 40 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8870 40 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8871 40 : gsi_next (&gsi2);
8872 :
8873 117 : for (unsigned i = 0; i < ncopies; ++i)
8874 : {
8875 77 : vec_dest = make_ssa_name (vectype);
8876 77 : gassign *vperm
8877 117 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8878 40 : i == 0 ? gimple_phi_result (new_phi) : NULL,
8879 : NULL, perm);
8880 77 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8881 :
8882 77 : slp_node->push_vec_def (vperm);
8883 : }
8884 :
8885 : return true;
8886 400 : }
8887 :
8888 : /* Return true if VECTYPE represents a vector that requires lowering
8889 : by the vector lowering pass. */
8890 :
8891 : bool
8892 648181 : vect_emulated_vector_p (tree vectype)
8893 : {
8894 1296362 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8895 650888 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8896 2689 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8897 : }
8898 :
8899 : /* Return true if we can emulate CODE on an integer mode representation
8900 : of a vector. */
8901 :
8902 : bool
8903 10881 : vect_can_vectorize_without_simd_p (tree_code code)
8904 : {
8905 10881 : switch (code)
8906 : {
8907 : case PLUS_EXPR:
8908 : case MINUS_EXPR:
8909 : case NEGATE_EXPR:
8910 : case BIT_AND_EXPR:
8911 : case BIT_IOR_EXPR:
8912 : case BIT_XOR_EXPR:
8913 : case BIT_NOT_EXPR:
8914 : return true;
8915 :
8916 10114 : default:
8917 10114 : return false;
8918 : }
8919 : }
8920 :
8921 : /* Likewise, but taking a code_helper. */
8922 :
8923 : bool
8924 154 : vect_can_vectorize_without_simd_p (code_helper code)
8925 : {
8926 154 : return (code.is_tree_code ()
8927 154 : && vect_can_vectorize_without_simd_p (tree_code (code)));
8928 : }
8929 :
8930 : /* Create vector init for vectorized iv. */
8931 : static tree
8932 916 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8933 : tree step_expr, poly_uint64 nunits,
8934 : tree vectype,
8935 : enum vect_induction_op_type induction_type)
8936 : {
8937 916 : unsigned HOST_WIDE_INT const_nunits;
8938 916 : tree vec_shift, vec_init, new_name;
8939 916 : unsigned i;
8940 916 : tree itype = TREE_TYPE (vectype);
8941 :
8942 : /* iv_loop is the loop to be vectorized. Create:
8943 : vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
8944 916 : new_name = gimple_convert (stmts, itype, init_expr);
8945 916 : switch (induction_type)
8946 : {
8947 18 : case vect_step_op_shr:
8948 18 : case vect_step_op_shl:
8949 : /* Build the Initial value from shift_expr. */
8950 18 : vec_init = gimple_build_vector_from_val (stmts,
8951 : vectype,
8952 : new_name);
8953 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8954 : build_zero_cst (itype), step_expr);
8955 18 : vec_init = gimple_build (stmts,
8956 : (induction_type == vect_step_op_shr
8957 : ? RSHIFT_EXPR : LSHIFT_EXPR),
8958 : vectype, vec_init, vec_shift);
8959 18 : break;
8960 :
8961 822 : case vect_step_op_neg:
8962 822 : {
8963 822 : vec_init = gimple_build_vector_from_val (stmts,
8964 : vectype,
8965 : new_name);
8966 822 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8967 : vectype, vec_init);
8968 : /* The encoding has 2 interleaved stepped patterns. */
8969 822 : vec_perm_builder sel (nunits, 2, 3);
8970 822 : sel.quick_grow (6);
8971 4110 : for (i = 0; i < 3; i++)
8972 : {
8973 2466 : sel[2 * i] = i;
8974 2466 : sel[2 * i + 1] = i + nunits;
8975 : }
8976 822 : vec_perm_indices indices (sel, 2, nunits);
8977 : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8978 : fail when vec_init is const vector. In that situation vec_perm is not
8979 : really needed. */
8980 822 : tree perm_mask_even
8981 822 : = vect_gen_perm_mask_any (vectype, indices);
8982 822 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8983 : vectype,
8984 : vec_init, vec_neg,
8985 : perm_mask_even);
8986 822 : }
8987 822 : break;
8988 :
8989 76 : case vect_step_op_mul:
8990 76 : {
8991 : /* Use unsigned mult to avoid UD integer overflow. */
8992 76 : gcc_assert (nunits.is_constant (&const_nunits));
8993 76 : tree utype = unsigned_type_for (itype);
8994 76 : tree uvectype = build_vector_type (utype,
8995 76 : TYPE_VECTOR_SUBPARTS (vectype));
8996 76 : new_name = gimple_convert (stmts, utype, new_name);
8997 76 : vec_init = gimple_build_vector_from_val (stmts,
8998 : uvectype,
8999 : new_name);
9000 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
9001 76 : tree elt_step = build_one_cst (utype);
9002 :
9003 76 : elts.quick_push (elt_step);
9004 660 : for (i = 1; i < const_nunits; i++)
9005 : {
9006 : /* Create: new_name_i = new_name + step_expr. */
9007 508 : elt_step = gimple_build (stmts, MULT_EXPR,
9008 : utype, elt_step, step_expr);
9009 508 : elts.quick_push (elt_step);
9010 : }
9011 : /* Create a vector from [new_name_0, new_name_1, ...,
9012 : new_name_nunits-1]. */
9013 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
9014 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9015 : vec_init, vec_mul);
9016 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
9017 76 : }
9018 76 : break;
9019 :
9020 0 : default:
9021 0 : gcc_unreachable ();
9022 : }
9023 :
9024 916 : return vec_init;
9025 : }
9026 :
9027 : /* Peel init_expr by skip_niter for induction_type. */
9028 : tree
9029 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9030 : tree skip_niters, tree step_expr,
9031 : enum vect_induction_op_type induction_type,
9032 : bool early_exit_p)
9033 : {
9034 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST || early_exit_p);
9035 84 : tree type = TREE_TYPE (init_expr);
9036 84 : unsigned prec = TYPE_PRECISION (type);
9037 84 : switch (induction_type)
9038 : {
9039 : /* neg inductions are typically not used for loop termination conditions but
9040 : are typically implemented as b = -b. That is every scalar iteration b is
9041 : negated. That means that for the initial value of b we will have to
9042 : determine whether the number of skipped iteration is a multiple of 2
9043 : because every 2 scalar iterations we are back at "b". */
9044 0 : case vect_step_op_neg:
9045 : /* For early exits the neg induction will always be the same value at the
9046 : start of the iteration. */
9047 0 : if (early_exit_p)
9048 : break;
9049 :
9050 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
9051 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9052 : /* else no change. */
9053 : break;
9054 :
9055 12 : case vect_step_op_shr:
9056 12 : case vect_step_op_shl:
9057 12 : skip_niters = fold_build1 (NOP_EXPR, type, skip_niters);
9058 12 : step_expr = fold_build1 (NOP_EXPR, type, step_expr);
9059 12 : step_expr = fold_build2 (MULT_EXPR, type, step_expr, skip_niters);
9060 : /* When shift mount >= precision, need to avoid UD.
9061 : In the original loop, there's no UD, and according to semantic,
9062 : init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
9063 12 : if ((!tree_fits_uhwi_p (step_expr)
9064 12 : || tree_to_uhwi (step_expr) >= prec)
9065 6 : && !early_exit_p)
9066 : {
9067 6 : if (induction_type == vect_step_op_shl
9068 6 : || TYPE_UNSIGNED (type))
9069 4 : init_expr = build_zero_cst (type);
9070 : else
9071 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9072 : init_expr,
9073 4 : wide_int_to_tree (type, prec - 1));
9074 : }
9075 : else
9076 : {
9077 8 : init_expr = fold_build2 ((induction_type == vect_step_op_shr
9078 : ? RSHIFT_EXPR : LSHIFT_EXPR),
9079 : type, init_expr, step_expr);
9080 6 : init_expr = force_gimple_operand (init_expr, stmts, false, NULL);
9081 : }
9082 : break;
9083 :
9084 72 : case vect_step_op_mul:
9085 72 : {
9086 : /* Due to UB we can't support vect_step_op_mul with early break for now.
9087 : so assert and block. */
9088 72 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9089 72 : tree utype = unsigned_type_for (type);
9090 72 : init_expr = gimple_convert (stmts, utype, init_expr);
9091 72 : wide_int skipn = wi::to_wide (skip_niters);
9092 72 : wide_int begin = wi::to_wide (step_expr);
9093 72 : auto_mpz base, exp, mod, res;
9094 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
9095 72 : wi::to_mpz (skipn, exp, UNSIGNED);
9096 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9097 72 : mpz_powm (res, base, exp, mod);
9098 72 : begin = wi::from_mpz (utype, res, true);
9099 72 : tree mult_expr = wide_int_to_tree (utype, begin);
9100 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
9101 : init_expr, mult_expr);
9102 72 : init_expr = gimple_convert (stmts, type, init_expr);
9103 72 : }
9104 72 : break;
9105 :
9106 0 : default:
9107 0 : gcc_unreachable ();
9108 : }
9109 :
9110 84 : return init_expr;
9111 : }
9112 :
9113 : /* Create vector step for vectorized iv. */
9114 : static tree
9115 1202 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9116 : poly_uint64 vf,
9117 : enum vect_induction_op_type induction_type)
9118 : {
9119 1202 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9120 1202 : tree new_name = NULL;
9121 : /* Step should be pow (step, vf) for mult induction. */
9122 1202 : if (induction_type == vect_step_op_mul)
9123 : {
9124 76 : gcc_assert (vf.is_constant ());
9125 76 : wide_int begin = wi::to_wide (step_expr);
9126 :
9127 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9128 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9129 :
9130 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9131 76 : }
9132 1126 : else if (induction_type == vect_step_op_neg)
9133 : /* Do nothing. */
9134 : ;
9135 : else
9136 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9137 : expr, step_expr);
9138 1202 : return new_name;
9139 : }
9140 :
9141 : static tree
9142 1202 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9143 : stmt_vec_info stmt_info,
9144 : tree new_name, tree vectype,
9145 : enum vect_induction_op_type induction_type)
9146 : {
9147 : /* No step is needed for neg induction. */
9148 1202 : if (induction_type == vect_step_op_neg)
9149 : return NULL;
9150 :
9151 94 : tree t = unshare_expr (new_name);
9152 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9153 : || TREE_CODE (new_name) == SSA_NAME);
9154 94 : tree new_vec = build_vector_from_val (vectype, t);
9155 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9156 : new_vec, vectype, NULL);
9157 94 : return vec_step;
9158 : }
9159 :
9160 : /* Update vectorized iv with vect_step, induc_def is init. */
9161 : static tree
9162 1390 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9163 : tree induc_def, tree vec_step,
9164 : enum vect_induction_op_type induction_type)
9165 : {
9166 1390 : tree vec_def = induc_def;
9167 1390 : switch (induction_type)
9168 : {
9169 76 : case vect_step_op_mul:
9170 76 : {
9171 : /* Use unsigned mult to avoid UD integer overflow. */
9172 76 : tree uvectype = unsigned_type_for (vectype);
9173 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9174 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9175 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9176 : vec_def, vec_step);
9177 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9178 : }
9179 76 : break;
9180 :
9181 12 : case vect_step_op_shr:
9182 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9183 : vec_def, vec_step);
9184 12 : break;
9185 :
9186 6 : case vect_step_op_shl:
9187 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9188 : vec_def, vec_step);
9189 6 : break;
9190 : case vect_step_op_neg:
9191 : vec_def = induc_def;
9192 : /* Do nothing. */
9193 : break;
9194 0 : default:
9195 0 : gcc_unreachable ();
9196 : }
9197 :
9198 1390 : return vec_def;
9199 :
9200 : }
9201 :
9202 : /* Function vectorizable_nonlinear_induction
9203 :
9204 : Check if STMT_INFO performs an nonlinear induction computation that can be
9205 : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9206 : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9207 : basic block.
9208 : Return true if STMT_INFO is vectorizable in this way. */
9209 :
9210 : static bool
9211 9662 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9212 : stmt_vec_info stmt_info,
9213 : slp_tree slp_node,
9214 : stmt_vector_for_cost *cost_vec)
9215 : {
9216 9662 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9217 9662 : unsigned ncopies;
9218 9662 : bool nested_in_vect_loop = false;
9219 9662 : class loop *iv_loop;
9220 9662 : tree vec_def;
9221 9662 : edge pe = loop_preheader_edge (loop);
9222 9662 : basic_block new_bb;
9223 9662 : tree vec_init, vec_step;
9224 9662 : tree new_name;
9225 9662 : gimple *new_stmt;
9226 9662 : gphi *induction_phi;
9227 9662 : tree induc_def, vec_dest;
9228 9662 : tree init_expr, step_expr;
9229 9662 : tree niters_skip;
9230 9662 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9231 9662 : unsigned i;
9232 9662 : gimple_stmt_iterator si;
9233 :
9234 9662 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9235 :
9236 9662 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9237 9662 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9238 9662 : enum vect_induction_op_type induction_type
9239 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9240 :
9241 9662 : gcc_assert (induction_type > vect_step_op_add);
9242 :
9243 9662 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
9244 9662 : gcc_assert (ncopies >= 1);
9245 :
9246 : /* FORNOW. Only handle nonlinear induction in the same loop. */
9247 9662 : if (nested_in_vect_loop_p (loop, stmt_info))
9248 : {
9249 0 : if (dump_enabled_p ())
9250 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9251 : "nonlinear induction in nested loop.\n");
9252 0 : return false;
9253 : }
9254 :
9255 9662 : iv_loop = loop;
9256 9662 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9257 :
9258 : /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
9259 : vector iv update for each iv and a permutation to generate wanted
9260 : vector iv. */
9261 9662 : if (SLP_TREE_LANES (slp_node) > 1)
9262 : {
9263 0 : if (dump_enabled_p ())
9264 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9265 : "SLP induction not supported for nonlinear"
9266 : " induction.\n");
9267 0 : return false;
9268 : }
9269 :
9270 9662 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9271 : {
9272 0 : if (dump_enabled_p ())
9273 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9274 : "floating point nonlinear induction vectorization"
9275 : " not supported.\n");
9276 0 : return false;
9277 : }
9278 :
9279 9662 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9280 9662 : init_expr = vect_phi_initial_value (phi);
9281 9662 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9282 : && TREE_CODE (step_expr) == INTEGER_CST);
9283 : /* step_expr should be aligned with init_expr,
9284 : .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9285 9662 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9286 :
9287 9662 : if (TREE_CODE (init_expr) == INTEGER_CST)
9288 3009 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9289 6653 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9290 : {
9291 : /* INIT_EXPR could be a bit_field, bail out for such case. */
9292 4 : if (dump_enabled_p ())
9293 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9294 : "nonlinear induction vectorization failed:"
9295 : " component type of vectype is not a nop conversion"
9296 : " from type of init_expr.\n");
9297 4 : return false;
9298 : }
9299 :
9300 9658 : switch (induction_type)
9301 : {
9302 2538 : case vect_step_op_neg:
9303 2538 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9304 : return false;
9305 2534 : if (TREE_CODE (init_expr) != INTEGER_CST
9306 190 : && TREE_CODE (init_expr) != REAL_CST)
9307 : {
9308 : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9309 190 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9310 0 : return false;
9311 :
9312 : /* The encoding has 2 interleaved stepped patterns. */
9313 190 : vec_perm_builder sel (nunits, 2, 3);
9314 190 : machine_mode mode = TYPE_MODE (vectype);
9315 190 : sel.quick_grow (6);
9316 950 : for (i = 0; i < 3; i++)
9317 : {
9318 570 : sel[i * 2] = i;
9319 570 : sel[i * 2 + 1] = i + nunits;
9320 : }
9321 190 : vec_perm_indices indices (sel, 2, nunits);
9322 190 : if (!can_vec_perm_const_p (mode, mode, indices))
9323 0 : return false;
9324 190 : }
9325 : break;
9326 :
9327 1058 : case vect_step_op_mul:
9328 1058 : {
9329 : /* Check for backend support of MULT_EXPR. */
9330 1058 : if (!directly_supported_p (MULT_EXPR, vectype))
9331 : return false;
9332 :
9333 : /* ?? How to construct vector step for variable number vector.
9334 : [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9335 : if (!vf.is_constant ())
9336 : return false;
9337 : }
9338 : break;
9339 :
9340 5744 : case vect_step_op_shr:
9341 : /* Check for backend support of RSHIFT_EXPR. */
9342 5744 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9343 : return false;
9344 :
9345 : /* Don't shift more than type precision to avoid UD. */
9346 26 : if (!tree_fits_uhwi_p (step_expr)
9347 26 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9348 : TYPE_PRECISION (TREE_TYPE (init_expr))))
9349 : return false;
9350 : break;
9351 :
9352 318 : case vect_step_op_shl:
9353 : /* Check for backend support of RSHIFT_EXPR. */
9354 318 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9355 : return false;
9356 :
9357 : /* Don't shift more than type precision to avoid UD. */
9358 12 : if (!tree_fits_uhwi_p (step_expr)
9359 12 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9360 : TYPE_PRECISION (TREE_TYPE (init_expr))))
9361 : return false;
9362 :
9363 : break;
9364 :
9365 0 : default:
9366 0 : gcc_unreachable ();
9367 : }
9368 :
9369 3390 : if (cost_vec) /* transformation not required. */
9370 : {
9371 2474 : unsigned inside_cost = 0, prologue_cost = 0;
9372 : /* loop cost for vec_loop. Neg induction doesn't have any
9373 : inside_cost. */
9374 2474 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9375 : slp_node, 0, vect_body);
9376 :
9377 : /* loop cost for vec_loop. Neg induction doesn't have any
9378 : inside_cost. */
9379 2474 : if (induction_type == vect_step_op_neg)
9380 1712 : inside_cost = 0;
9381 :
9382 : /* prologue cost for vec_init and vec_step. */
9383 2474 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9384 : slp_node, 0, vect_prologue);
9385 :
9386 2474 : if (dump_enabled_p ())
9387 60 : dump_printf_loc (MSG_NOTE, vect_location,
9388 : "vect_model_induction_cost: inside_cost = %d, "
9389 : "prologue_cost = %d. \n", inside_cost,
9390 : prologue_cost);
9391 :
9392 2474 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9393 2474 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9394 2474 : return true;
9395 : }
9396 :
9397 : /* Transform. */
9398 :
9399 : /* Compute a vector variable, initialized with the first VF values of
9400 : the induction variable. E.g., for an iv with IV_PHI='X' and
9401 : evolution S, for a vector of 4 units, we want to compute:
9402 : [X, X + S, X + 2*S, X + 3*S]. */
9403 :
9404 916 : if (dump_enabled_p ())
9405 32 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9406 :
9407 916 : pe = loop_preheader_edge (iv_loop);
9408 : /* Find the first insertion point in the BB. */
9409 916 : basic_block bb = gimple_bb (phi);
9410 916 : si = gsi_after_labels (bb);
9411 :
9412 916 : gimple_seq stmts = NULL;
9413 :
9414 916 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9415 : /* If we are using the loop mask to "peel" for alignment then we need
9416 : to adjust the start value here. */
9417 916 : if (niters_skip != NULL_TREE)
9418 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9419 : step_expr, induction_type, false);
9420 :
9421 916 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9422 : step_expr, nunits, vectype,
9423 : induction_type);
9424 916 : if (stmts)
9425 : {
9426 162 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9427 162 : gcc_assert (!new_bb);
9428 : }
9429 :
9430 916 : stmts = NULL;
9431 916 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9432 : vf, induction_type);
9433 916 : if (stmts)
9434 : {
9435 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9436 0 : gcc_assert (!new_bb);
9437 : }
9438 :
9439 916 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9440 : new_name, vectype,
9441 : induction_type);
9442 : /* Create the following def-use cycle:
9443 : loop prolog:
9444 : vec_init = ...
9445 : vec_step = ...
9446 : loop:
9447 : vec_iv = PHI <vec_init, vec_loop>
9448 : ...
9449 : STMT
9450 : ...
9451 : vec_loop = vec_iv + vec_step; */
9452 :
9453 : /* Create the induction-phi that defines the induction-operand. */
9454 916 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9455 916 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9456 916 : induc_def = PHI_RESULT (induction_phi);
9457 :
9458 : /* Create the iv update inside the loop. */
9459 916 : stmts = NULL;
9460 916 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9461 : induc_def, vec_step,
9462 : induction_type);
9463 :
9464 916 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9465 916 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9466 :
9467 : /* Set the arguments of the phi node: */
9468 916 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9469 916 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9470 : UNKNOWN_LOCATION);
9471 :
9472 916 : slp_node->push_vec_def (induction_phi);
9473 :
9474 : /* In case that vectorization factor (VF) is bigger than the number
9475 : of elements that we can fit in a vectype (nunits), we have to generate
9476 : more than one vector stmt - i.e - we need to "unroll" the
9477 : vector stmt by a factor VF/nunits. For more details see documentation
9478 : in vectorizable_operation. */
9479 :
9480 916 : if (ncopies > 1)
9481 : {
9482 286 : stmts = NULL;
9483 : /* FORNOW. This restriction should be relaxed. */
9484 286 : gcc_assert (!nested_in_vect_loop);
9485 :
9486 286 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9487 : nunits, induction_type);
9488 :
9489 286 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9490 : new_name, vectype,
9491 : induction_type);
9492 286 : vec_def = induc_def;
9493 1046 : for (i = 1; i < ncopies; i++)
9494 : {
9495 : /* vec_i = vec_prev + vec_step. */
9496 474 : stmts = NULL;
9497 474 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9498 : vec_def, vec_step,
9499 : induction_type);
9500 474 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9501 474 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9502 474 : slp_node->push_vec_def (new_stmt);
9503 : }
9504 : }
9505 :
9506 916 : if (dump_enabled_p ())
9507 64 : dump_printf_loc (MSG_NOTE, vect_location,
9508 : "transform induction: created def-use cycle: %G%G",
9509 32 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9510 :
9511 : return true;
9512 : }
9513 :
9514 : /* Function vectorizable_induction
9515 :
9516 : Check if STMT_INFO performs an induction computation that can be vectorized.
9517 : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9518 : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9519 : Return true if STMT_INFO is vectorizable in this way. */
9520 :
9521 : bool
9522 290851 : vectorizable_induction (loop_vec_info loop_vinfo,
9523 : stmt_vec_info stmt_info,
9524 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9525 : {
9526 290851 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9527 290851 : bool nested_in_vect_loop = false;
9528 290851 : class loop *iv_loop;
9529 290851 : tree vec_def;
9530 290851 : edge pe = loop_preheader_edge (loop);
9531 290851 : basic_block new_bb;
9532 290851 : tree vec_init = NULL_TREE, vec_step, t;
9533 290851 : tree new_name;
9534 290851 : gphi *induction_phi;
9535 290851 : tree induc_def, vec_dest;
9536 290851 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9537 290851 : unsigned i;
9538 290851 : tree expr;
9539 290851 : tree index_vectype = NULL_TREE;
9540 290851 : gimple_stmt_iterator si;
9541 290851 : enum vect_induction_op_type induction_type
9542 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9543 :
9544 318994 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9545 156212 : if (!phi)
9546 : return false;
9547 :
9548 156212 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9549 : return false;
9550 :
9551 : /* Make sure it was recognized as induction computation. */
9552 156212 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9553 : return false;
9554 :
9555 : /* Handle nonlinear induction in a separate place. */
9556 152581 : if (induction_type != vect_step_op_add)
9557 9662 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9558 9662 : slp_node, cost_vec);
9559 :
9560 142919 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9561 142919 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9562 :
9563 : /* FORNOW. These restrictions should be relaxed. */
9564 142919 : if (nested_in_vect_loop_p (loop, stmt_info))
9565 : {
9566 740 : imm_use_iterator imm_iter;
9567 740 : use_operand_p use_p;
9568 740 : gimple *exit_phi;
9569 740 : edge latch_e;
9570 740 : tree loop_arg;
9571 :
9572 740 : exit_phi = NULL;
9573 740 : latch_e = loop_latch_edge (loop->inner);
9574 740 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9575 2256 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9576 : {
9577 800 : gimple *use_stmt = USE_STMT (use_p);
9578 800 : if (is_gimple_debug (use_stmt))
9579 36 : continue;
9580 :
9581 764 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9582 : {
9583 : exit_phi = use_stmt;
9584 : break;
9585 : }
9586 740 : }
9587 740 : if (exit_phi)
9588 : {
9589 24 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9590 24 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9591 8 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9592 : {
9593 16 : if (dump_enabled_p ())
9594 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9595 : "inner-loop induction only used outside "
9596 : "of the outer vectorized loop.\n");
9597 16 : return false;
9598 : }
9599 : }
9600 :
9601 724 : nested_in_vect_loop = true;
9602 724 : iv_loop = loop->inner;
9603 : }
9604 : else
9605 : iv_loop = loop;
9606 142903 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9607 :
9608 142903 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
9609 : {
9610 : /* The current SLP code creates the step value element-by-element. */
9611 : if (dump_enabled_p ())
9612 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9613 : "SLP induction not supported for variable-length"
9614 : " vectors.\n");
9615 : return false;
9616 : }
9617 :
9618 142903 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9619 : {
9620 12 : if (dump_enabled_p ())
9621 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9622 : "floating point induction vectorization disabled\n");
9623 12 : return false;
9624 : }
9625 :
9626 142891 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9627 142891 : gcc_assert (step_expr != NULL_TREE);
9628 285736 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
9629 285640 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
9630 : {
9631 12 : if (dump_enabled_p ())
9632 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9633 : "bit-precision induction vectorization not "
9634 : "supported.\n");
9635 12 : return false;
9636 : }
9637 142879 : tree stept = TREE_TYPE (step_expr);
9638 142879 : tree step_vectype = get_same_sized_vectype (stept, vectype);
9639 142879 : stept = TREE_TYPE (step_vectype);
9640 :
9641 : /* Check for target support of the vectorized arithmetic used here. */
9642 142879 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
9643 142879 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
9644 24472 : return false;
9645 118407 : if (!nunits.is_constant ())
9646 : {
9647 : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
9648 : return false;
9649 : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
9650 : if (SCALAR_FLOAT_TYPE_P (stept))
9651 : {
9652 : tree index_type = build_nonstandard_integer_type
9653 : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
9654 :
9655 : index_vectype = build_vector_type (index_type, nunits);
9656 : if (!can_float_p (TYPE_MODE (step_vectype),
9657 : TYPE_MODE (index_vectype), 1))
9658 : return false;
9659 : }
9660 : }
9661 :
9662 118407 : unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
9663 118407 : if (cost_vec) /* transformation not required. */
9664 : {
9665 309465 : unsigned inside_cost = 0, prologue_cost = 0;
9666 : /* We eventually need to set a vector type on invariant
9667 : arguments. */
9668 : unsigned j;
9669 : slp_tree child;
9670 309465 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9671 206310 : if (!vect_maybe_update_slp_op_vectype
9672 206310 : (child, SLP_TREE_VECTYPE (slp_node)))
9673 : {
9674 0 : if (dump_enabled_p ())
9675 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9676 : "incompatible vector types for "
9677 : "invariants\n");
9678 0 : return false;
9679 : }
9680 : /* loop cost for vec_loop. */
9681 103155 : inside_cost = record_stmt_cost (cost_vec, nvects,
9682 : vector_stmt, slp_node, 0, vect_body);
9683 : /* prologue cost for vec_init (if not nested) and step. */
9684 103155 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9685 : scalar_to_vec,
9686 : slp_node, 0, vect_prologue);
9687 103155 : if (dump_enabled_p ())
9688 4006 : dump_printf_loc (MSG_NOTE, vect_location,
9689 : "vect_model_induction_cost: inside_cost = %d, "
9690 : "prologue_cost = %d .\n", inside_cost,
9691 : prologue_cost);
9692 :
9693 103155 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9694 103155 : DUMP_VECT_SCOPE ("vectorizable_induction");
9695 103155 : return true;
9696 : }
9697 :
9698 : /* Transform. */
9699 :
9700 : /* Compute a vector variable, initialized with the first VF values of
9701 : the induction variable. E.g., for an iv with IV_PHI='X' and
9702 : evolution S, for a vector of 4 units, we want to compute:
9703 : [X, X + S, X + 2*S, X + 3*S]. */
9704 :
9705 15252 : if (dump_enabled_p ())
9706 2767 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9707 :
9708 15252 : pe = loop_preheader_edge (iv_loop);
9709 : /* Find the first insertion point in the BB. */
9710 15252 : basic_block bb = gimple_bb (phi);
9711 15252 : si = gsi_after_labels (bb);
9712 :
9713 : /* For SLP induction we have to generate several IVs as for example
9714 : with group size 3 we need
9715 : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9716 : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9717 15252 : gimple_stmt_iterator incr_si;
9718 15252 : bool insert_after;
9719 15252 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
9720 :
9721 : /* The initial values are vectorized, but any lanes > group_size
9722 : need adjustment. */
9723 15252 : slp_tree init_node
9724 15252 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9725 :
9726 : /* Gather steps. Since we do not vectorize inductions as
9727 : cycles we have to reconstruct the step from SCEV data. */
9728 15252 : unsigned group_size = SLP_TREE_LANES (slp_node);
9729 15252 : tree *steps = XALLOCAVEC (tree, group_size);
9730 15252 : tree *inits = XALLOCAVEC (tree, group_size);
9731 15252 : stmt_vec_info phi_info;
9732 46990 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9733 : {
9734 16486 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9735 16486 : if (!init_node)
9736 16247 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9737 : pe->dest_idx);
9738 : }
9739 :
9740 : /* Now generate the IVs. */
9741 30504 : gcc_assert (multiple_p (nunits * nvects, group_size));
9742 15252 : unsigned nivs;
9743 15252 : unsigned HOST_WIDE_INT const_nunits;
9744 15252 : if (nested_in_vect_loop)
9745 : nivs = nvects;
9746 15034 : else if (nunits.is_constant (&const_nunits))
9747 : {
9748 : /* Compute the number of distinct IVs we need. First reduce
9749 : group_size if it is a multiple of const_nunits so we get
9750 : one IV for a group_size of 4 but const_nunits 2. */
9751 15034 : unsigned group_sizep = group_size;
9752 15034 : if (group_sizep % const_nunits == 0)
9753 111 : group_sizep = group_sizep / const_nunits;
9754 15034 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
9755 : }
9756 : else
9757 : {
9758 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9759 : nivs = 1;
9760 : }
9761 15252 : gimple_seq init_stmts = NULL;
9762 15252 : tree lupdate_mul = NULL_TREE;
9763 218 : if (!nested_in_vect_loop)
9764 : {
9765 15034 : if (nunits.is_constant (&const_nunits))
9766 : {
9767 : /* The number of iterations covered in one vector iteration. */
9768 15034 : unsigned lup_mul = (nvects * const_nunits) / group_size;
9769 15034 : lupdate_mul
9770 15034 : = build_vector_from_val (step_vectype,
9771 15034 : SCALAR_FLOAT_TYPE_P (stept)
9772 28 : ? build_real_from_wide (stept, lup_mul,
9773 : UNSIGNED)
9774 30040 : : build_int_cstu (stept, lup_mul));
9775 : }
9776 : else
9777 : {
9778 : if (SCALAR_FLOAT_TYPE_P (stept))
9779 : {
9780 : tree tem = build_int_cst (integer_type_node, vf);
9781 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9782 : }
9783 : else
9784 : lupdate_mul = build_int_cst (stept, vf);
9785 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9786 : lupdate_mul);
9787 : }
9788 : }
9789 15252 : tree peel_mul = NULL_TREE;
9790 15252 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9791 : {
9792 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9793 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9794 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9795 : else
9796 0 : peel_mul = gimple_convert (&init_stmts, stept,
9797 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9798 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
9799 : step_vectype, peel_mul);
9800 : }
9801 15252 : tree step_mul = NULL_TREE;
9802 15252 : unsigned ivn;
9803 15252 : auto_vec<tree> vec_steps;
9804 31080 : for (ivn = 0; ivn < nivs; ++ivn)
9805 : {
9806 15828 : gimple_seq stmts = NULL;
9807 15828 : bool invariant = true;
9808 15828 : if (nunits.is_constant (&const_nunits))
9809 : {
9810 15828 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9811 15828 : tree_vector_builder init_elts (vectype, const_nunits, 1);
9812 15828 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9813 103418 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9814 : {
9815 : /* The scalar steps of the IVs. */
9816 87590 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9817 87590 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9818 87590 : step_elts.quick_push (elt);
9819 87590 : if (!init_node)
9820 : {
9821 : /* The scalar inits of the IVs if not vectorized. */
9822 86340 : elt = inits[(ivn*const_nunits + eltn) % group_size];
9823 86340 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
9824 86340 : TREE_TYPE (elt)))
9825 264 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9826 264 : TREE_TYPE (vectype), elt);
9827 86340 : init_elts.quick_push (elt);
9828 : }
9829 : /* The number of steps to add to the initial values. */
9830 87590 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9831 175180 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9832 175078 : ? build_real_from_wide (stept, mul_elt,
9833 : UNSIGNED)
9834 175078 : : build_int_cstu (stept, mul_elt));
9835 : }
9836 15828 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
9837 15828 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9838 15828 : if (!init_node)
9839 15576 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
9840 15828 : }
9841 : else
9842 : {
9843 : tree step = gimple_convert (&init_stmts, stept, steps[0]);
9844 : if (init_node)
9845 : ;
9846 : else if (INTEGRAL_TYPE_P (stept))
9847 : {
9848 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9849 : /* Build the initial value directly as a VEC_SERIES_EXPR. */
9850 : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
9851 : step_vectype, new_name, step);
9852 : if (!useless_type_conversion_p (vectype, step_vectype))
9853 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9854 : vectype, vec_init);
9855 : }
9856 : else
9857 : {
9858 : /* Build:
9859 : [base, base, base, ...]
9860 : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9861 : gcc_assert (SCALAR_FLOAT_TYPE_P (stept));
9862 : gcc_assert (flag_associative_math);
9863 : gcc_assert (index_vectype != NULL_TREE);
9864 :
9865 : tree index = build_index_vector (index_vectype, 0, 1);
9866 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9867 : tree base_vec = gimple_build_vector_from_val (&init_stmts,
9868 : step_vectype,
9869 : new_name);
9870 : tree step_vec = gimple_build_vector_from_val (&init_stmts,
9871 : step_vectype,
9872 : step);
9873 : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
9874 : step_vectype, index);
9875 : vec_init = gimple_build (&init_stmts, MULT_EXPR,
9876 : step_vectype, vec_init, step_vec);
9877 : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
9878 : step_vectype, vec_init, base_vec);
9879 : if (!useless_type_conversion_p (vectype, step_vectype))
9880 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9881 : vectype, vec_init);
9882 : }
9883 : /* iv_loop is nested in the loop to be vectorized. Generate:
9884 : vec_step = [S, S, S, S] */
9885 : t = unshare_expr (step);
9886 : gcc_assert (CONSTANT_CLASS_P (t)
9887 : || TREE_CODE (t) == SSA_NAME);
9888 : vec_step = gimple_build_vector_from_val (&init_stmts,
9889 : step_vectype, t);
9890 : }
9891 15828 : vec_steps.safe_push (vec_step);
9892 15828 : if (peel_mul)
9893 : {
9894 0 : if (!step_mul)
9895 : {
9896 0 : gcc_assert (!nunits.is_constant ());
9897 : step_mul = gimple_build (&init_stmts,
9898 : MINUS_EXPR, step_vectype,
9899 : build_zero_cst (step_vectype), peel_mul);
9900 : }
9901 : else
9902 0 : step_mul = gimple_build (&init_stmts,
9903 : MINUS_EXPR, step_vectype,
9904 : step_mul, peel_mul);
9905 : }
9906 :
9907 : /* Create the induction-phi that defines the induction-operand. */
9908 15828 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9909 : "vec_iv_");
9910 15828 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9911 15828 : induc_def = PHI_RESULT (induction_phi);
9912 :
9913 : /* Create the iv update inside the loop */
9914 15828 : tree up = vec_step;
9915 15828 : if (lupdate_mul)
9916 : {
9917 15576 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
9918 : {
9919 : /* When we're using loop_len produced by SELEC_VL, the
9920 : non-final iterations are not always processing VF
9921 : elements. So vectorize induction variable instead of
9922 :
9923 : _21 = vect_vec_iv_.6_22 + { VF, ... };
9924 :
9925 : We should generate:
9926 :
9927 : _35 = .SELECT_VL (ivtmp_33, VF);
9928 : vect_cst__22 = [vec_duplicate_expr] _35;
9929 : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
9930 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
9931 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
9932 : vectype, 0, 0, false);
9933 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9934 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
9935 : else
9936 0 : expr = gimple_convert (&stmts, stept, len);
9937 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
9938 : expr);
9939 0 : up = gimple_build (&stmts, MULT_EXPR,
9940 : step_vectype, vec_step, lupdate_mul);
9941 : }
9942 : else
9943 15576 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9944 : vec_step, lupdate_mul);
9945 : }
9946 15828 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9947 15828 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
9948 15828 : vec_def = gimple_convert (&stmts, vectype, vec_def);
9949 15828 : insert_iv_increment (&incr_si, insert_after, stmts);
9950 15828 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9951 : UNKNOWN_LOCATION);
9952 :
9953 15828 : if (init_node)
9954 252 : vec_init = vect_get_slp_vect_def (init_node, ivn);
9955 15828 : if (!nested_in_vect_loop
9956 15828 : && step_mul
9957 15828 : && !integer_zerop (step_mul))
9958 : {
9959 15135 : gcc_assert (invariant);
9960 15135 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9961 15135 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9962 : vec_step, step_mul);
9963 15135 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9964 : vec_def, up);
9965 15135 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9966 : }
9967 :
9968 : /* Set the arguments of the phi node: */
9969 15828 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9970 :
9971 15828 : slp_node->push_vec_def (induction_phi);
9972 : }
9973 15252 : if (!nested_in_vect_loop)
9974 : {
9975 : /* Fill up to the number of vectors we need for the whole group. */
9976 15034 : if (nunits.is_constant (&const_nunits))
9977 15034 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
9978 : else
9979 : nivs = 1;
9980 15034 : vec_steps.reserve (nivs-ivn);
9981 30089 : for (; ivn < nivs; ++ivn)
9982 : {
9983 21 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9984 21 : vec_steps.quick_push (vec_steps[0]);
9985 : }
9986 : }
9987 :
9988 : /* Re-use IVs when we can. We are generating further vector
9989 : stmts by adding VF' * stride to the IVs generated above. */
9990 15252 : if (ivn < nvects)
9991 : {
9992 3391 : if (nunits.is_constant (&const_nunits))
9993 : {
9994 3391 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
9995 3391 : / group_size);
9996 3391 : lupdate_mul
9997 3391 : = build_vector_from_val (step_vectype,
9998 3391 : SCALAR_FLOAT_TYPE_P (stept)
9999 8 : ? build_real_from_wide (stept,
10000 8 : vfp, UNSIGNED)
10001 6774 : : build_int_cstu (stept, vfp));
10002 : }
10003 : else
10004 : {
10005 : if (SCALAR_FLOAT_TYPE_P (stept))
10006 : {
10007 : tree tem = build_int_cst (integer_type_node, nunits);
10008 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
10009 : }
10010 : else
10011 : lupdate_mul = build_int_cst (stept, nunits);
10012 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
10013 : lupdate_mul);
10014 : }
10015 10902 : for (; ivn < nvects; ++ivn)
10016 : {
10017 7511 : gimple *iv
10018 7511 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10019 7511 : tree def = gimple_get_lhs (iv);
10020 7511 : if (ivn < 2*nivs)
10021 3483 : vec_steps[ivn - nivs]
10022 3483 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10023 3483 : vec_steps[ivn - nivs], lupdate_mul);
10024 7511 : gimple_seq stmts = NULL;
10025 7511 : def = gimple_convert (&stmts, step_vectype, def);
10026 22533 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10027 7511 : def, vec_steps[ivn % nivs]);
10028 7511 : def = gimple_convert (&stmts, vectype, def);
10029 7511 : if (gimple_code (iv) == GIMPLE_PHI)
10030 3483 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10031 : else
10032 : {
10033 4028 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10034 4028 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10035 : }
10036 7511 : slp_node->push_vec_def (def);
10037 : }
10038 : }
10039 :
10040 15252 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10041 15252 : gcc_assert (!new_bb);
10042 :
10043 15252 : return true;
10044 15252 : }
10045 :
10046 : /* Function vectorizable_live_operation_1.
10047 :
10048 : helper function for vectorizable_live_operation. */
10049 :
10050 : static tree
10051 2823 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
10052 : tree vectype, slp_tree slp_node,
10053 : tree bitsize, tree bitstart, tree vec_lhs,
10054 : tree lhs_type, gimple_stmt_iterator *exit_gsi)
10055 : {
10056 2823 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10057 :
10058 2823 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10059 2823 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10060 5648 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10061 2825 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
10062 :
10063 2823 : gimple_seq stmts = NULL;
10064 2823 : tree new_tree;
10065 :
10066 : /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10067 2823 : if (integer_zerop (bitstart))
10068 : {
10069 213 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10070 : vec_lhs_phi, bitsize, bitstart);
10071 :
10072 : /* Convert the extracted vector element to the scalar type. */
10073 213 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10074 : }
10075 2610 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10076 : {
10077 : /* Emit:
10078 :
10079 : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>
10080 :
10081 : where VEC_LHS is the vectorized live-out result, LEN is the length of
10082 : the vector, BIAS is the load-store bias. The bias should not be used
10083 : at all since we are not using load/store operations, but LEN will be
10084 : REALLEN + BIAS, so subtract it to get to the correct position. */
10085 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10086 0 : gimple_seq tem = NULL;
10087 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10088 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
10089 : &LOOP_VINFO_LENS (loop_vinfo),
10090 : 1, vectype, 0, 1, false);
10091 0 : gimple_seq_add_seq (&stmts, tem);
10092 :
10093 : /* LAST_INDEX = LEN - 1. */
10094 0 : tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
10095 0 : len, build_one_cst (TREE_TYPE (len)));
10096 :
10097 : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>. */
10098 0 : tree scalar_res
10099 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10100 : vec_lhs_phi, last_index);
10101 :
10102 : /* Convert the extracted vector element to the scalar type. */
10103 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10104 : }
10105 2610 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10106 : {
10107 : /* Emit:
10108 :
10109 : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10110 :
10111 : where VEC_LHS is the vectorized live-out result and MASK is
10112 : the loop mask for the final iteration. */
10113 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10114 0 : tree scalar_type = TREE_TYPE (vectype);
10115 0 : gimple_seq tem = NULL;
10116 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10117 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10118 : &LOOP_VINFO_MASKS (loop_vinfo),
10119 : 1, vectype, 0);
10120 0 : tree scalar_res;
10121 0 : gimple_seq_add_seq (&stmts, tem);
10122 :
10123 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10124 : mask, vec_lhs_phi);
10125 :
10126 : /* Convert the extracted vector element to the scalar type. */
10127 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10128 : }
10129 : else
10130 : {
10131 2610 : tree bftype = TREE_TYPE (vectype);
10132 2610 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10133 85 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10134 2610 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10135 2610 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10136 : &stmts, true, NULL_TREE);
10137 : }
10138 :
10139 2823 : *exit_gsi = gsi_after_labels (exit_bb);
10140 2823 : if (stmts)
10141 2823 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10142 :
10143 2823 : return new_tree;
10144 : }
10145 :
10146 : /* Function vectorizable_live_operation.
10147 :
10148 : STMT_INFO computes a value that is used outside the loop. Check if
10149 : it can be supported. */
10150 :
10151 : bool
10152 259732 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10153 : slp_tree slp_node, slp_instance slp_node_instance,
10154 : int slp_index, bool vec_stmt_p,
10155 : stmt_vector_for_cost *cost_vec)
10156 : {
10157 259732 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10158 259732 : imm_use_iterator imm_iter;
10159 259732 : tree lhs, lhs_type, bitsize;
10160 259732 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10161 259732 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10162 259732 : gimple *use_stmt;
10163 259732 : use_operand_p use_p;
10164 259732 : auto_vec<tree> vec_oprnds;
10165 259732 : int vec_entry = 0;
10166 259732 : poly_uint64 vec_index = 0;
10167 :
10168 259732 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10169 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10170 :
10171 : /* If a stmt of a reduction is live, vectorize it via
10172 : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10173 : validity so just trigger the transform here. */
10174 259732 : if (vect_is_reduction (slp_node))
10175 : {
10176 57906 : if (!vec_stmt_p)
10177 : return true;
10178 : /* For SLP reductions we vectorize the epilogue for all involved stmts
10179 : together. For SLP reduction chains we only get here once. */
10180 23481 : if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
10181 23222 : && slp_index != 0)
10182 : return true;
10183 23022 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
10184 23022 : if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10185 23022 : || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10186 : return true;
10187 :
10188 22179 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10189 22179 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10190 22170 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10191 : slp_node_instance,
10192 : LOOP_VINFO_MAIN_EXIT (loop_vinfo));
10193 :
10194 : /* If early break we only have to materialize the reduction on the merge
10195 : block, but we have to find an alternate exit first. */
10196 22179 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10197 : {
10198 28 : slp_tree phis_node = slp_node_instance->reduc_phis;
10199 28 : stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
10200 89 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10201 28 : if (exit != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
10202 : {
10203 23 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10204 : phis_node, slp_node_instance,
10205 : exit);
10206 23 : break;
10207 28 : }
10208 28 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10209 9 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10210 : phis_node, slp_node_instance,
10211 : LOOP_VINFO_MAIN_EXIT
10212 : (loop_vinfo));
10213 : }
10214 :
10215 22179 : return true;
10216 : }
10217 :
10218 : /* If STMT is not relevant and it is a simple assignment and its inputs are
10219 : invariant then it can remain in place, unvectorized. The original last
10220 : scalar value that it computes will be used. */
10221 201826 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10222 : {
10223 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10224 0 : if (dump_enabled_p ())
10225 0 : dump_printf_loc (MSG_NOTE, vect_location,
10226 : "statement is simple and uses invariant. Leaving in "
10227 : "place.\n");
10228 0 : return true;
10229 : }
10230 :
10231 201826 : gcc_assert (slp_index >= 0);
10232 :
10233 : /* Get the last occurrence of the scalar index from the concatenation of
10234 : all the slp vectors. Calculate which slp vector it is and the index
10235 : within. */
10236 201826 : int num_scalar = SLP_TREE_LANES (slp_node);
10237 201826 : int num_vec = vect_get_num_copies (vinfo, slp_node);
10238 201826 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10239 :
10240 : /* Calculate which vector contains the result, and which lane of
10241 : that vector we need. */
10242 201826 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10243 : {
10244 : if (dump_enabled_p ())
10245 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10246 : "Cannot determine which vector holds the"
10247 : " final result.\n");
10248 : return false;
10249 : }
10250 :
10251 201826 : if (!vec_stmt_p)
10252 : {
10253 : /* No transformation required. */
10254 161977 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10255 : {
10256 27726 : if (SLP_TREE_LANES (slp_node) != 1)
10257 : {
10258 19 : if (dump_enabled_p ())
10259 19 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10260 : "can't operate on partial vectors "
10261 : "because an SLP statement is live after "
10262 : "the loop.\n");
10263 19 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10264 : }
10265 27707 : else if (num_vec > 1)
10266 : {
10267 16086 : if (dump_enabled_p ())
10268 51 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10269 : "can't operate on partial vectors "
10270 : "because ncopies is greater than 1.\n");
10271 16086 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10272 : }
10273 : else
10274 : {
10275 11621 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10276 : OPTIMIZE_FOR_SPEED))
10277 0 : vect_record_loop_mask (loop_vinfo,
10278 : &LOOP_VINFO_MASKS (loop_vinfo),
10279 : 1, vectype, NULL);
10280 11621 : else if (can_vec_extract_var_idx_p (
10281 11621 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10282 0 : vect_record_loop_len (loop_vinfo,
10283 : &LOOP_VINFO_LENS (loop_vinfo),
10284 : 1, vectype, 1);
10285 : else
10286 : {
10287 11621 : if (dump_enabled_p ())
10288 630 : dump_printf_loc (
10289 630 : MSG_MISSED_OPTIMIZATION, vect_location,
10290 : "can't operate on partial vectors "
10291 : "because the target doesn't support extract "
10292 : "last reduction.\n");
10293 11621 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10294 : }
10295 : }
10296 : }
10297 : /* ??? Enable for loop costing as well. */
10298 27726 : if (!loop_vinfo)
10299 90885 : record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
10300 : 0, vect_epilogue);
10301 161977 : return true;
10302 : }
10303 :
10304 : /* Use the lhs of the original scalar statement. */
10305 39849 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10306 39849 : if (dump_enabled_p ())
10307 1023 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10308 : "stmt %G", stmt);
10309 :
10310 39849 : lhs = gimple_get_lhs (stmt);
10311 39849 : lhs_type = TREE_TYPE (lhs);
10312 :
10313 39849 : bitsize = vector_element_bits_tree (vectype);
10314 :
10315 : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10316 39849 : gcc_assert (!loop_vinfo
10317 : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10318 : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10319 : || SLP_TREE_LANES (slp_node) == 1));
10320 :
10321 : /* Get the correct slp vectorized stmt. */
10322 39849 : tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10323 39849 : gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10324 :
10325 : /* In case we need to early break vectorize also get the first stmt. */
10326 39849 : tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10327 :
10328 : /* Get entry to use. */
10329 39849 : tree bitstart = bitsize_int (vec_index);
10330 39849 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10331 :
10332 39849 : if (loop_vinfo)
10333 : {
10334 : /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10335 : requirement, insert one phi node for it. It looks like:
10336 : loop;
10337 : BB:
10338 : # lhs' = PHI <lhs>
10339 : ==>
10340 : loop;
10341 : BB:
10342 : # vec_lhs' = PHI <vec_lhs>
10343 : new_tree = lane_extract <vec_lhs', ...>;
10344 : lhs' = new_tree; */
10345 :
10346 2882 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10347 : /* Check if we have a loop where the chosen exit is not the main exit,
10348 : in these cases for an early break we restart the iteration the vector code
10349 : did. For the live values we want the value at the start of the iteration
10350 : rather than at the end. */
10351 2882 : edge main_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
10352 2882 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10353 14981 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10354 9217 : if (!is_gimple_debug (use_stmt)
10355 9217 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10356 2823 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10357 : {
10358 2823 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10359 2823 : phi_arg_index_from_use (use_p));
10360 2823 : gcc_assert (loop_exit_edge_p (loop, e));
10361 2823 : bool main_exit_edge = e == main_e;
10362 2823 : tree tmp_vec_lhs = vec_lhs;
10363 2823 : tree tmp_bitstart = bitstart;
10364 :
10365 : /* For early exit where the exit is not in the BB that leads
10366 : to the latch then we're restarting the iteration in the
10367 : scalar loop. So get the first live value. */
10368 2823 : bool early_break_first_element_p
10369 2823 : = all_exits_as_early_p || !main_exit_edge;
10370 2823 : if (early_break_first_element_p)
10371 : {
10372 195 : tmp_vec_lhs = vec_lhs0;
10373 195 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10374 : }
10375 :
10376 2823 : gimple_stmt_iterator exit_gsi;
10377 2823 : tree new_tree
10378 2823 : = vectorizable_live_operation_1 (loop_vinfo,
10379 : e->dest, vectype,
10380 : slp_node, bitsize,
10381 : tmp_bitstart, tmp_vec_lhs,
10382 : lhs_type, &exit_gsi);
10383 :
10384 2823 : auto gsi = gsi_for_stmt (use_stmt);
10385 2823 : tree lhs_phi = gimple_phi_result (use_stmt);
10386 2823 : remove_phi_node (&gsi, false);
10387 2823 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10388 2823 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10389 2823 : break;
10390 2882 : }
10391 :
10392 : /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10393 12158 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10394 6394 : gcc_assert (is_gimple_debug (use_stmt)
10395 2882 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10396 : }
10397 : else
10398 : {
10399 : /* For basic-block vectorization simply insert the lane-extraction. */
10400 36967 : tree bftype = TREE_TYPE (vectype);
10401 36967 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10402 0 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10403 36967 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10404 : vec_lhs, bitsize, bitstart);
10405 36967 : gimple_seq stmts = NULL;
10406 36967 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10407 : &stmts, true, NULL_TREE);
10408 36967 : if (TREE_CODE (new_tree) == SSA_NAME
10409 73934 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10410 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10411 36967 : if (is_a <gphi *> (vec_stmt))
10412 : {
10413 2593 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10414 2593 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10415 : }
10416 : else
10417 : {
10418 34374 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10419 34374 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10420 : }
10421 :
10422 : /* Replace use of lhs with newly computed result. If the use stmt is a
10423 : single arg PHI, just replace all uses of PHI result. It's necessary
10424 : because lcssa PHI defining lhs may be before newly inserted stmt. */
10425 36967 : use_operand_p use_p;
10426 36967 : stmt_vec_info use_stmt_info;
10427 236823 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10428 162889 : if (!is_gimple_debug (use_stmt)
10429 162889 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10430 112568 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10431 : {
10432 : /* ??? This can happen when the live lane ends up being
10433 : rooted in a vector construction code-generated by an
10434 : external SLP node (and code-generation for that already
10435 : happened). See gcc.dg/vect/bb-slp-47.c.
10436 : Doing this is what would happen if that vector CTOR
10437 : were not code-generated yet so it is not too bad.
10438 : ??? In fact we'd likely want to avoid this situation
10439 : in the first place. */
10440 64307 : if (TREE_CODE (new_tree) == SSA_NAME
10441 63945 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10442 63945 : && gimple_code (use_stmt) != GIMPLE_PHI
10443 120820 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10444 : use_stmt))
10445 : {
10446 362 : if (dump_enabled_p ())
10447 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10448 : "Using original scalar computation for "
10449 : "live lane because use preceeds vector "
10450 : "def\n");
10451 362 : continue;
10452 : }
10453 : /* ??? It can also happen that we end up pulling a def into
10454 : a loop where replacing out-of-loop uses would require
10455 : a new LC SSA PHI node. Retain the original scalar in
10456 : those cases as well. PR98064. */
10457 65290 : if (TREE_CODE (new_tree) == SSA_NAME
10458 63583 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10459 63583 : && (gimple_bb (use_stmt)->loop_father
10460 63583 : != gimple_bb (vec_stmt)->loop_father)
10461 71220 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10462 7637 : gimple_bb (use_stmt)->loop_father))
10463 : {
10464 1707 : if (dump_enabled_p ())
10465 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10466 : "Using original scalar computation for "
10467 : "live lane because there is an out-of-loop "
10468 : "definition for it\n");
10469 1707 : continue;
10470 : }
10471 190148 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10472 64136 : SET_USE (use_p, new_tree);
10473 61876 : update_stmt (use_stmt);
10474 36967 : }
10475 : }
10476 :
10477 : return true;
10478 259732 : }
10479 :
10480 : /* Given loop represented by LOOP_VINFO, return true if computation of
10481 : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10482 : otherwise. */
10483 :
10484 : static bool
10485 61375 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
10486 : {
10487 61375 : gcc_assert (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo));
10488 :
10489 : /* Constant case. */
10490 61375 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10491 : {
10492 35816 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10493 35816 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10494 :
10495 35816 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10496 35816 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10497 35816 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10498 : return true;
10499 : }
10500 :
10501 25559 : widest_int max;
10502 25559 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10503 : /* Check the upper bound of loop niters. */
10504 25559 : if (get_max_loop_iterations (loop, &max))
10505 : {
10506 25559 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10507 25559 : signop sgn = TYPE_SIGN (type);
10508 25559 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10509 25559 : if (max < type_max)
10510 25334 : return true;
10511 25559 : }
10512 : return false;
10513 25559 : }
10514 :
10515 : /* Return a mask type with half the number of elements as OLD_TYPE,
10516 : given that it should have mode NEW_MODE. */
10517 :
10518 : tree
10519 3714 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10520 : {
10521 3714 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10522 3714 : return build_truth_vector_type_for_mode (nunits, new_mode);
10523 : }
10524 :
10525 : /* Return a mask type with twice as many elements as OLD_TYPE,
10526 : given that it should have mode NEW_MODE. */
10527 :
10528 : tree
10529 5915 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10530 : {
10531 5915 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10532 5915 : return build_truth_vector_type_for_mode (nunits, new_mode);
10533 : }
10534 :
10535 : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10536 : contain a sequence of NVECTORS masks that each control a vector of type
10537 : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10538 : these vector masks with the vector version of SCALAR_MASK. */
10539 :
10540 : void
10541 77672 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10542 : unsigned int nvectors, tree vectype, tree scalar_mask)
10543 : {
10544 77672 : gcc_assert (nvectors != 0);
10545 :
10546 77672 : if (scalar_mask)
10547 : {
10548 3638 : scalar_cond_masked_key cond (scalar_mask, nvectors);
10549 3638 : loop_vinfo->scalar_cond_masked_set.add (cond);
10550 : }
10551 :
10552 77672 : masks->mask_set.add (std::make_pair (vectype, nvectors));
10553 77672 : }
10554 :
10555 : /* Given a complete set of masks MASKS, extract mask number INDEX
10556 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10557 : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10558 :
10559 : See the comment above vec_loop_masks for more details about the mask
10560 : arrangement. */
10561 :
10562 : tree
10563 208 : vect_get_loop_mask (loop_vec_info loop_vinfo,
10564 : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10565 : unsigned int nvectors, tree vectype, unsigned int index)
10566 : {
10567 208 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10568 : == vect_partial_vectors_while_ult)
10569 : {
10570 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10571 0 : tree mask_type = rgm->type;
10572 :
10573 : /* Populate the rgroup's mask array, if this is the first time we've
10574 : used it. */
10575 0 : if (rgm->controls.is_empty ())
10576 : {
10577 0 : rgm->controls.safe_grow_cleared (nvectors, true);
10578 0 : for (unsigned int i = 0; i < nvectors; ++i)
10579 : {
10580 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10581 : /* Provide a dummy definition until the real one is available. */
10582 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10583 0 : rgm->controls[i] = mask;
10584 : }
10585 : }
10586 :
10587 0 : tree mask = rgm->controls[index];
10588 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10589 0 : TYPE_VECTOR_SUBPARTS (vectype)))
10590 : {
10591 : /* A loop mask for data type X can be reused for data type Y
10592 : if X has N times more elements than Y and if Y's elements
10593 : are N times bigger than X's. In this case each sequence
10594 : of N elements in the loop mask will be all-zero or all-one.
10595 : We can then view-convert the mask so that each sequence of
10596 : N elements is replaced by a single element. */
10597 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10598 : TYPE_VECTOR_SUBPARTS (vectype)));
10599 0 : gimple_seq seq = NULL;
10600 0 : mask_type = truth_type_for (vectype);
10601 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10602 0 : if (seq)
10603 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10604 : }
10605 0 : return mask;
10606 : }
10607 208 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10608 : == vect_partial_vectors_avx512)
10609 : {
10610 : /* The number of scalars per iteration and the number of vectors are
10611 : both compile-time constants. */
10612 208 : unsigned int nscalars_per_iter
10613 208 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10614 208 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10615 :
10616 208 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10617 :
10618 : /* The stored nV is dependent on the mask type produced. */
10619 208 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10620 : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10621 : == rgm->factor);
10622 208 : nvectors = rgm->factor;
10623 :
10624 : /* Populate the rgroup's mask array, if this is the first time we've
10625 : used it. */
10626 208 : if (rgm->controls.is_empty ())
10627 : {
10628 20 : rgm->controls.safe_grow_cleared (nvectors, true);
10629 106 : for (unsigned int i = 0; i < nvectors; ++i)
10630 : {
10631 86 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10632 : /* Provide a dummy definition until the real one is available. */
10633 86 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10634 86 : rgm->controls[i] = mask;
10635 : }
10636 : }
10637 208 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10638 : TYPE_VECTOR_SUBPARTS (vectype)))
10639 160 : return rgm->controls[index];
10640 :
10641 : /* Split the vector if needed. Since we are dealing with integer mode
10642 : masks with AVX512 we can operate on the integer representation
10643 : performing the whole vector shifting. */
10644 48 : unsigned HOST_WIDE_INT factor;
10645 48 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10646 48 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
10647 0 : gcc_assert (ok);
10648 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10649 48 : tree mask_type = truth_type_for (vectype);
10650 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10651 48 : unsigned vi = index / factor;
10652 48 : unsigned vpart = index % factor;
10653 48 : tree vec = rgm->controls[vi];
10654 48 : gimple_seq seq = NULL;
10655 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10656 48 : lang_hooks.types.type_for_mode
10657 48 : (TYPE_MODE (rgm->type), 1), vec);
10658 : /* For integer mode masks simply shift the right bits into position. */
10659 48 : if (vpart != 0)
10660 40 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10661 : build_int_cst (integer_type_node,
10662 80 : (TYPE_VECTOR_SUBPARTS (vectype)
10663 40 : * vpart)));
10664 48 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10665 48 : (TYPE_MODE (mask_type), 1), vec);
10666 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10667 48 : if (seq)
10668 48 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10669 48 : return vec;
10670 : }
10671 : else
10672 0 : gcc_unreachable ();
10673 : }
10674 :
10675 : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10676 : lengths for controlling an operation on VECTYPE. The operation splits
10677 : each element of VECTYPE into FACTOR separate subelements, measuring the
10678 : length as a number of these subelements. */
10679 :
10680 : void
10681 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10682 : unsigned int nvectors, tree vectype, unsigned int factor)
10683 : {
10684 0 : gcc_assert (nvectors != 0);
10685 0 : if (lens->length () < nvectors)
10686 0 : lens->safe_grow_cleared (nvectors, true);
10687 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10688 :
10689 : /* The number of scalars per iteration, scalar occupied bytes and
10690 : the number of vectors are both compile-time constants. */
10691 0 : unsigned int nscalars_per_iter
10692 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10693 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10694 :
10695 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10696 : {
10697 : /* For now, we only support cases in which all loads and stores fall back
10698 : to VnQI or none do. */
10699 0 : gcc_assert (!rgl->max_nscalars_per_iter
10700 : || (rgl->factor == 1 && factor == 1)
10701 : || (rgl->max_nscalars_per_iter * rgl->factor
10702 : == nscalars_per_iter * factor));
10703 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
10704 0 : rgl->type = vectype;
10705 0 : rgl->factor = factor;
10706 : }
10707 0 : }
10708 :
10709 : /* Given a complete set of lengths LENS, extract length number INDEX
10710 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10711 : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10712 : multipled by the number of elements that should be processed.
10713 : Insert any set-up statements before GSI. */
10714 :
10715 : tree
10716 0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10717 : vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10718 : unsigned int index, unsigned int factor, bool adjusted)
10719 : {
10720 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10721 0 : bool use_bias_adjusted_len =
10722 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10723 :
10724 : /* Populate the rgroup's len array, if this is the first time we've
10725 : used it. */
10726 0 : if (rgl->controls.is_empty ())
10727 : {
10728 0 : rgl->controls.safe_grow_cleared (nvectors, true);
10729 0 : for (unsigned int i = 0; i < nvectors; ++i)
10730 : {
10731 0 : tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10732 0 : gcc_assert (len_type != NULL_TREE);
10733 :
10734 0 : tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10735 :
10736 : /* Provide a dummy definition until the real one is available. */
10737 0 : SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10738 0 : rgl->controls[i] = len;
10739 :
10740 0 : if (use_bias_adjusted_len)
10741 : {
10742 0 : gcc_assert (i == 0);
10743 0 : tree adjusted_len =
10744 0 : make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10745 0 : SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10746 0 : rgl->bias_adjusted_ctrl = adjusted_len;
10747 : }
10748 : }
10749 : }
10750 :
10751 0 : if (use_bias_adjusted_len && adjusted)
10752 0 : return rgl->bias_adjusted_ctrl;
10753 :
10754 0 : tree loop_len = rgl->controls[index];
10755 0 : if (rgl->factor == 1 && factor == 1)
10756 : {
10757 0 : poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10758 0 : poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10759 0 : if (maybe_ne (nunits1, nunits2))
10760 : {
10761 : /* A loop len for data type X can be reused for data type Y
10762 : if X has N times more elements than Y and if Y's elements
10763 : are N times bigger than X's. */
10764 0 : gcc_assert (multiple_p (nunits1, nunits2));
10765 0 : factor = exact_div (nunits1, nunits2).to_constant ();
10766 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10767 0 : gimple_seq seq = NULL;
10768 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10769 0 : build_int_cst (iv_type, factor));
10770 0 : if (seq)
10771 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10772 : }
10773 0 : }
10774 0 : else if (factor && rgl->factor != factor)
10775 : {
10776 : /* The number of scalars per iteration, scalar occupied bytes and
10777 : the number of vectors are both compile-time constants. */
10778 0 : unsigned int nscalars_per_iter
10779 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10780 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10781 0 : unsigned int rglvecsize = rgl->factor * rgl->max_nscalars_per_iter;
10782 0 : unsigned int vecsize = nscalars_per_iter * factor;
10783 0 : if (rglvecsize > vecsize)
10784 : {
10785 0 : unsigned int fac = rglvecsize / vecsize;
10786 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10787 0 : gimple_seq seq = NULL;
10788 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10789 0 : build_int_cst (iv_type, fac));
10790 0 : if (seq)
10791 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10792 : }
10793 0 : else if (rglvecsize < vecsize)
10794 : {
10795 0 : unsigned int fac = vecsize / rglvecsize;
10796 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10797 0 : gimple_seq seq = NULL;
10798 0 : loop_len = gimple_build (&seq, MULT_EXPR, iv_type, loop_len,
10799 0 : build_int_cst (iv_type, fac));
10800 0 : if (seq)
10801 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10802 : }
10803 : }
10804 : return loop_len;
10805 : }
10806 :
10807 : /* Generate the tree for the loop len mask and return it. Given the lens,
10808 : nvectors, vectype, index and factor to gen the len mask as below.
10809 :
10810 : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
10811 : */
10812 : tree
10813 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10814 : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
10815 : unsigned int nvectors, tree vectype, tree stmt,
10816 : unsigned int index, unsigned int factor)
10817 : {
10818 0 : tree all_one_mask = build_all_ones_cst (vectype);
10819 0 : tree all_zero_mask = build_zero_cst (vectype);
10820 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
10821 : factor, true);
10822 0 : tree bias = build_int_cst (intQI_type_node,
10823 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
10824 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
10825 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
10826 : all_one_mask, all_zero_mask, len,
10827 : bias);
10828 0 : gimple_call_set_lhs (call, len_mask);
10829 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
10830 :
10831 0 : return len_mask;
10832 : }
10833 :
10834 : /* Scale profiling counters by estimation for LOOP which is vectorized
10835 : by factor VF.
10836 : If FLAT is true, the loop we started with had unrealistically flat
10837 : profile. */
10838 :
10839 : static void
10840 61418 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
10841 : {
10842 : /* For flat profiles do not scale down proportionally by VF and only
10843 : cap by known iteration count bounds. */
10844 61418 : if (flat)
10845 : {
10846 34387 : if (dump_file && (dump_flags & TDF_DETAILS))
10847 5255 : fprintf (dump_file,
10848 : "Vectorized loop profile seems flat; not scaling iteration "
10849 : "count down by the vectorization factor %i\n", vf);
10850 34387 : scale_loop_profile (loop, profile_probability::always (),
10851 : get_likely_max_loop_iterations_int (loop));
10852 34387 : return;
10853 : }
10854 : /* Loop body executes VF fewer times and exit increases VF times. */
10855 27031 : profile_count entry_count = loop_preheader_edge (loop)->count ();
10856 :
10857 : /* If we have unreliable loop profile avoid dropping entry
10858 : count below header count. This can happen since loops
10859 : has unrealistically low trip counts. */
10860 27031 : while (vf > 1
10861 28162 : && loop->header->count > entry_count
10862 57353 : && loop->header->count < entry_count * vf)
10863 : {
10864 2160 : if (dump_file && (dump_flags & TDF_DETAILS))
10865 153 : fprintf (dump_file,
10866 : "Vectorization factor %i seems too large for profile "
10867 : "prevoiusly believed to be consistent; reducing.\n", vf);
10868 2160 : vf /= 2;
10869 : }
10870 :
10871 27031 : if (entry_count.nonzero_p ())
10872 27031 : set_edge_probability_and_rescale_others
10873 27031 : (exit_e,
10874 27031 : entry_count.probability_in (loop->header->count / vf));
10875 : /* Avoid producing very large exit probability when we do not have
10876 : sensible profile. */
10877 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
10878 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10879 27031 : loop->latch->count = single_pred_edge (loop->latch)->count ();
10880 :
10881 27031 : scale_loop_profile (loop, profile_probability::always () / vf,
10882 : get_likely_max_loop_iterations_int (loop));
10883 : }
10884 :
10885 : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10886 : original loop that has now been vectorized.
10887 :
10888 : The inits of the data_references need to be advanced with the number of
10889 : iterations of the main loop. This has been computed in vect_do_peeling and
10890 : is stored in parameter ADVANCE.
10891 :
10892 : Since the loop_vec_info of this EPILOGUE was constructed for the original
10893 : loop, its stmt_vec_infos all point to the original statements. These need
10894 : to be updated to point to their corresponding copies.
10895 :
10896 : The data_reference's connections also need to be updated. Their
10897 : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10898 : stmt_vec_infos, their statements need to point to their corresponding
10899 : copy. */
10900 :
10901 : static void
10902 7056 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10903 : {
10904 7056 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10905 7056 : hash_map<tree,tree> mapping;
10906 7056 : gimple *orig_stmt, *new_stmt;
10907 7056 : gimple_stmt_iterator epilogue_gsi;
10908 7056 : gphi_iterator epilogue_phi_gsi;
10909 7056 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10910 7056 : basic_block *epilogue_bbs = get_loop_body (epilogue);
10911 7056 : unsigned i;
10912 :
10913 7056 : free (LOOP_VINFO_BBS (epilogue_vinfo));
10914 7056 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10915 7056 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
10916 :
10917 : /* The EPILOGUE loop is a copy of the original loop so they share the same
10918 : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10919 : point to the copied statements. */
10920 21168 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10921 : {
10922 14112 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10923 36315 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10924 : {
10925 22203 : new_stmt = epilogue_phi_gsi.phi ();
10926 :
10927 22203 : gcc_assert (gimple_uid (new_stmt) > 0);
10928 22203 : stmt_vinfo
10929 22203 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10930 :
10931 22203 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10932 : }
10933 :
10934 28224 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10935 144650 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10936 : {
10937 130538 : new_stmt = gsi_stmt (epilogue_gsi);
10938 130538 : if (is_gimple_debug (new_stmt))
10939 21966 : continue;
10940 :
10941 108572 : gcc_assert (gimple_uid (new_stmt) > 0);
10942 108572 : stmt_vinfo
10943 108572 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10944 :
10945 108572 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10946 :
10947 108572 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10948 108572 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10949 : {
10950 2009 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10951 : /* Set BB such that the assert in
10952 : 'get_initial_defs_for_reduction' is able to determine that
10953 : the BB of the related stmt is inside this loop. */
10954 2009 : gimple_set_bb (stmt,
10955 : gimple_bb (new_stmt));
10956 2009 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10957 2009 : gcc_assert (related_vinfo == NULL
10958 : || related_vinfo == stmt_vinfo);
10959 : }
10960 : }
10961 : }
10962 :
10963 7056 : struct data_reference *dr;
10964 7056 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10965 32641 : FOR_EACH_VEC_ELT (datarefs, i, dr)
10966 : {
10967 25585 : orig_stmt = DR_STMT (dr);
10968 25585 : gcc_assert (gimple_uid (orig_stmt) > 0);
10969 25585 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10970 25585 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10971 : }
10972 :
10973 : /* Advance data_reference's with the number of iterations of the previous
10974 : loop and its prologue. */
10975 7056 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10976 :
10977 : /* Remember the advancement made. */
10978 7056 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
10979 7056 : }
10980 :
10981 : /* When vectorizing early break statements instructions that happen before
10982 : the early break in the current BB need to be moved to after the early
10983 : break. This function deals with that and assumes that any validity
10984 : checks has already been performed.
10985 :
10986 : While moving the instructions if it encounters a VUSE or VDEF it then
10987 : corrects the VUSES as it moves the statements along. GDEST is the location
10988 : in which to insert the new statements. */
10989 :
10990 : static void
10991 1406 : move_early_exit_stmts (loop_vec_info loop_vinfo)
10992 : {
10993 1406 : DUMP_VECT_SCOPE ("move_early_exit_stmts");
10994 :
10995 1406 : if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
10996 1189 : return;
10997 :
10998 : /* Move all stmts that need moving. */
10999 217 : basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11000 217 : gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
11001 :
11002 217 : tree last_seen_vuse = NULL_TREE;
11003 533 : for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11004 : {
11005 : /* We have to update crossed degenerate virtual PHIs. Simply
11006 : elide them. */
11007 316 : if (gphi *vphi = dyn_cast <gphi *> (stmt))
11008 : {
11009 7 : tree vdef = gimple_phi_result (vphi);
11010 7 : tree vuse = gimple_phi_arg_def (vphi, 0);
11011 7 : imm_use_iterator iter;
11012 7 : use_operand_p use_p;
11013 7 : gimple *use_stmt;
11014 30 : FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
11015 : {
11016 48 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
11017 16 : SET_USE (use_p, vuse);
11018 7 : }
11019 7 : auto gsi = gsi_for_stmt (stmt);
11020 7 : remove_phi_node (&gsi, true);
11021 7 : last_seen_vuse = vuse;
11022 7 : continue;
11023 7 : }
11024 :
11025 : /* Check to see if statement is still required for vect or has been
11026 : elided. */
11027 309 : auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11028 309 : if (!stmt_info)
11029 0 : continue;
11030 :
11031 309 : if (dump_enabled_p ())
11032 158 : dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11033 :
11034 309 : gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11035 309 : gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
11036 618 : last_seen_vuse = gimple_vuse (stmt);
11037 : }
11038 :
11039 : /* Update all the stmts with their new reaching VUSES. */
11040 679 : for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11041 : {
11042 194 : if (dump_enabled_p ())
11043 158 : dump_printf_loc (MSG_NOTE, vect_location,
11044 : "updating vuse to %T for load %G",
11045 : last_seen_vuse, p);
11046 194 : gimple_set_vuse (p, last_seen_vuse);
11047 194 : update_stmt (p);
11048 : }
11049 :
11050 : /* And update the LC PHIs on exits. */
11051 1098 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11052 447 : if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11053 243 : if (gphi *phi = get_virtual_phi (e->dest))
11054 460 : SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11055 : }
11056 :
11057 : /* Generate adjustment code for early break scalar IVs filling in the value
11058 : we created earlier on for LOOP_VINFO_EARLY_BRK_NITERS_VAR. */
11059 :
11060 : static void
11061 1406 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vec_info loop_vinfo)
11062 : {
11063 1406 : DUMP_VECT_SCOPE ("vect_update_ivs_after_vectorizer_for_early_breaks");
11064 :
11065 1406 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11066 0 : return;
11067 :
11068 1406 : gcc_assert (LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo));
11069 :
11070 1406 : tree phi_var = LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo);
11071 1406 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11072 1406 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11073 1406 : tree ty_var = TREE_TYPE (phi_var);
11074 1406 : auto loop = LOOP_VINFO_LOOP (loop_vinfo);
11075 1406 : tree induc_var = niters_skip ? copy_ssa_name (phi_var) : phi_var;
11076 :
11077 1406 : auto induction_phi = create_phi_node (induc_var, loop->header);
11078 1406 : tree induc_def = PHI_RESULT (induction_phi);
11079 :
11080 : /* Create the iv update inside the loop. */
11081 1406 : gimple_seq init_stmts = NULL;
11082 1406 : gimple_seq stmts = NULL;
11083 1406 : gimple_seq iv_stmts = NULL;
11084 1406 : tree tree_vf = build_int_cst (ty_var, vf);
11085 :
11086 : /* For loop len targets we have to use .SELECT_VL (ivtmp_33, VF); instead of
11087 : just += VF as the VF can change in between two loop iterations. */
11088 1406 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
11089 : {
11090 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
11091 0 : tree_vf = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
11092 : NULL_TREE, 0, 0, true);
11093 : }
11094 :
11095 1406 : tree iter_var;
11096 1406 : if (POINTER_TYPE_P (ty_var))
11097 : {
11098 0 : tree offset = gimple_convert (&stmts, sizetype, tree_vf);
11099 0 : iter_var = gimple_build (&stmts, POINTER_PLUS_EXPR, ty_var, induc_def,
11100 : gimple_convert (&stmts, sizetype, offset));
11101 : }
11102 : else
11103 : {
11104 1406 : tree offset = gimple_convert (&stmts, ty_var, tree_vf);
11105 1406 : iter_var = gimple_build (&stmts, PLUS_EXPR, ty_var, induc_def, offset);
11106 : }
11107 :
11108 1406 : tree init_var = build_zero_cst (ty_var);
11109 1406 : if (niters_skip)
11110 0 : init_var = gimple_build (&init_stmts, MINUS_EXPR, ty_var, init_var,
11111 : gimple_convert (&init_stmts, ty_var, niters_skip));
11112 :
11113 1406 : add_phi_arg (induction_phi, iter_var,
11114 : loop_latch_edge (loop), UNKNOWN_LOCATION);
11115 1406 : add_phi_arg (induction_phi, init_var,
11116 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
11117 :
11118 : /* Find the first insertion point in the BB. */
11119 1406 : auto pe = loop_preheader_edge (loop);
11120 :
11121 : /* If we've done any peeling, calculate the peeling adjustment needed to the
11122 : final IV. */
11123 1406 : if (niters_skip)
11124 : {
11125 0 : tree induc_type = TREE_TYPE (induc_def);
11126 0 : tree s_induc_type = signed_type_for (induc_type);
11127 0 : induc_def = gimple_build (&iv_stmts, MAX_EXPR, s_induc_type,
11128 : gimple_convert (&iv_stmts, s_induc_type,
11129 : induc_def),
11130 : build_zero_cst (s_induc_type));
11131 0 : auto stmt = gimple_build_assign (phi_var,
11132 : gimple_convert (&iv_stmts, induc_type,
11133 : induc_def));
11134 0 : gimple_seq_add_stmt_without_update (&iv_stmts, stmt);
11135 0 : basic_block exit_bb = NULL;
11136 : /* Identify the early exit merge block. I wish we had stored this. */
11137 0 : for (auto e : get_loop_exit_edges (loop))
11138 0 : if (e != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
11139 : {
11140 0 : exit_bb = e->dest;
11141 0 : break;
11142 0 : }
11143 :
11144 0 : gcc_assert (exit_bb);
11145 0 : auto exit_gsi = gsi_after_labels (exit_bb);
11146 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
11147 : }
11148 : /* Write the init_stmts in the loop-preheader block. */
11149 1406 : auto psi = gsi_last_nondebug_bb (pe->src);
11150 1406 : gsi_insert_seq_after (&psi, init_stmts, GSI_LAST_NEW_STMT);
11151 : /* Wite the adjustments in the header block. */
11152 1406 : basic_block bb = loop->header;
11153 1406 : auto si = gsi_after_labels (bb);
11154 1406 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11155 : }
11156 :
11157 : /* Function vect_transform_loop.
11158 :
11159 : The analysis phase has determined that the loop is vectorizable.
11160 : Vectorize the loop - created vectorized stmts to replace the scalar
11161 : stmts in the loop, and update the loop exit condition.
11162 : Returns scalar epilogue loop if any. */
11163 :
11164 : class loop *
11165 61418 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11166 : {
11167 61418 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11168 61418 : class loop *epilogue = NULL;
11169 61418 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11170 61418 : int nbbs = loop->num_nodes;
11171 61418 : int i;
11172 61418 : tree niters_vector = NULL_TREE;
11173 61418 : tree step_vector = NULL_TREE;
11174 61418 : tree niters_vector_mult_vf = NULL_TREE;
11175 61418 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11176 61418 : unsigned int lowest_vf = constant_lower_bound (vf);
11177 61418 : gimple *stmt;
11178 61418 : bool check_profitability = false;
11179 61418 : unsigned int th;
11180 61418 : bool flat = maybe_flat_loop_profile (loop);
11181 61418 : bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);
11182 :
11183 61418 : DUMP_VECT_SCOPE ("vec_transform_loop");
11184 :
11185 61418 : if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11186 54362 : loop_vinfo->shared->check_datarefs ();
11187 :
11188 : /* Use the more conservative vectorization threshold. If the number
11189 : of iterations is constant assume the cost check has been performed
11190 : by our caller. If the threshold makes all loops profitable that
11191 : run at least the (estimated) vectorization factor number of times
11192 : checking is pointless, too. */
11193 61418 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11194 61418 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11195 : {
11196 18548 : if (dump_enabled_p ())
11197 174 : dump_printf_loc (MSG_NOTE, vect_location,
11198 : "Profitability threshold is %d loop iterations.\n",
11199 : th);
11200 : check_profitability = true;
11201 : }
11202 :
11203 : /* Make sure there exists a single-predecessor exit bb. Do this before
11204 : versioning. */
11205 61418 : edge e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
11206 61418 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11207 : {
11208 18947 : split_loop_exit_edge (e, true);
11209 18947 : if (dump_enabled_p ())
11210 2268 : dump_printf (MSG_NOTE, "split exit edge\n");
11211 : }
11212 :
11213 : /* Version the loop first, if required, so the profitability check
11214 : comes first. */
11215 :
11216 61418 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11217 : {
11218 3783 : class loop *sloop
11219 3783 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11220 3783 : sloop->force_vectorize = false;
11221 3783 : check_profitability = false;
11222 : }
11223 :
11224 : /* Make sure there exists a single-predecessor exit bb also on the
11225 : scalar loop copy. Do this after versioning but before peeling
11226 : so CFG structure is fine for both scalar and if-converted loop
11227 : to make slpeel_duplicate_current_defs_from_edges face matched
11228 : loop closed PHI nodes on the exit. */
11229 61418 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11230 : {
11231 8037 : e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
11232 8037 : if (! single_pred_p (e->dest))
11233 : {
11234 7768 : split_loop_exit_edge (e, true);
11235 7768 : if (dump_enabled_p ())
11236 1140 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11237 : }
11238 : }
11239 :
11240 61418 : tree niters = vect_build_loop_niters (loop_vinfo);
11241 61418 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11242 61418 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11243 61418 : tree advance;
11244 61418 : drs_init_vec orig_drs_init;
11245 61418 : bool niters_no_overflow = uncounted_p ? false /* Not known. */
11246 61375 : : loop_niters_no_overflow (loop_vinfo);
11247 :
11248 61418 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11249 : &step_vector, &niters_vector_mult_vf, th,
11250 : check_profitability, niters_no_overflow,
11251 : &advance);
11252 :
11253 : /* Assign hierarchical discriminators to the vectorized loop. */
11254 61418 : poly_uint64 vf_val = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11255 61418 : unsigned int vf_int = constant_lower_bound (vf_val);
11256 61418 : if (vf_int > DISCR_MULTIPLICITY_MAX)
11257 : vf_int = DISCR_MULTIPLICITY_MAX;
11258 :
11259 : /* Assign unique copy_id dynamically instead of using hardcoded constants.
11260 : Epilogue and main vectorized loops get different copy_ids. */
11261 61418 : gimple *loop_last = last_nondebug_stmt (loop->header);
11262 61418 : location_t loop_loc
11263 61418 : = loop_last ? gimple_location (loop_last) : UNKNOWN_LOCATION;
11264 61144 : if (loop_loc != UNKNOWN_LOCATION)
11265 : {
11266 50615 : unsigned int copyid = allocate_copyid_base (loop_loc, 1);
11267 50615 : assign_discriminators_to_loop (loop, vf_int, copyid);
11268 : }
11269 61418 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11270 61418 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11271 : {
11272 : /* Ifcvt duplicates loop preheader, loop body and produces an basic
11273 : block after loop exit. We need to scale all that. */
11274 91 : basic_block preheader
11275 91 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11276 91 : preheader->count
11277 : = preheader->count.apply_probability
11278 91 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11279 91 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11280 : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11281 91 : LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo)->dest->count = preheader->count;
11282 : }
11283 :
11284 61418 : if (niters_vector == NULL_TREE && !uncounted_p)
11285 : {
11286 27331 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11287 27331 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11288 55404 : && known_eq (lowest_vf, vf))
11289 : {
11290 27328 : niters_vector
11291 27328 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11292 27328 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11293 27328 : step_vector = build_one_cst (TREE_TYPE (niters));
11294 : }
11295 748 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11296 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11297 : &step_vector, niters_no_overflow);
11298 : else
11299 : /* vect_do_peeling subtracted the number of peeled prologue
11300 : iterations from LOOP_VINFO_NITERS. */
11301 747 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11302 : &niters_vector, &step_vector,
11303 : niters_no_overflow);
11304 : }
11305 :
11306 : /* 1) Make sure the loop header has exactly two entries
11307 : 2) Make sure we have a preheader basic block. */
11308 :
11309 61418 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11310 :
11311 61418 : split_edge (loop_preheader_edge (loop));
11312 :
11313 61418 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11314 : /* This will deal with any possible peeling. */
11315 1 : vect_prepare_for_masked_peels (loop_vinfo);
11316 :
11317 : /* Handle any code motion that we need to for early-break vectorization after
11318 : we've done peeling but just before we start vectorizing. */
11319 61418 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11320 : {
11321 1406 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vinfo);
11322 1406 : move_early_exit_stmts (loop_vinfo);
11323 : }
11324 :
11325 : /* Remove existing clobber stmts and prefetches. */
11326 187520 : for (i = 0; i < nbbs; i++)
11327 : {
11328 126102 : basic_block bb = bbs[i];
11329 1094786 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11330 : {
11331 842582 : stmt = gsi_stmt (si);
11332 842582 : if (gimple_clobber_p (stmt)
11333 842582 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11334 : {
11335 90 : unlink_stmt_vdef (stmt);
11336 90 : gsi_remove (&si, true);
11337 90 : release_defs (stmt);
11338 : }
11339 : else
11340 842492 : gsi_next (&si);
11341 : }
11342 : }
11343 :
11344 : /* Schedule the SLP instances. */
11345 61418 : if (!loop_vinfo->slp_instances.is_empty ())
11346 : {
11347 61418 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11348 61418 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11349 : }
11350 :
11351 : /* Generate the loop invariant statements. */
11352 61418 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11353 : {
11354 73 : if (dump_enabled_p ())
11355 30 : dump_printf_loc (MSG_NOTE, vect_location,
11356 : "------>generating loop invariant statements\n");
11357 73 : gimple_stmt_iterator gsi;
11358 73 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11359 73 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11360 : GSI_CONTINUE_LINKING);
11361 : }
11362 :
11363 : /* Stub out scalar statements that must not survive vectorization and
11364 : were not picked as relevant in any SLP instance.
11365 : Doing this here helps with grouped statements, or statements that
11366 : are involved in patterns. */
11367 187520 : for (i = 0; i < nbbs; i++)
11368 : {
11369 126102 : basic_block bb = bbs[i];
11370 126102 : stmt_vec_info stmt_info;
11371 252204 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11372 1678350 : !gsi_end_p (gsi); gsi_next (&gsi))
11373 : {
11374 1552248 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11375 6355 : if (!call || !gimple_call_internal_p (call))
11376 1547047 : continue;
11377 5201 : internal_fn ifn = gimple_call_internal_fn (call);
11378 5201 : if (ifn == IFN_MASK_LOAD)
11379 : {
11380 735 : tree lhs = gimple_get_lhs (call);
11381 735 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11382 : {
11383 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11384 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11385 0 : gsi_replace (&gsi, new_stmt, true);
11386 : }
11387 : }
11388 4466 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11389 : {
11390 2295 : tree lhs = gimple_get_lhs (call);
11391 2295 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11392 : {
11393 0 : tree else_arg
11394 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11395 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11396 0 : gsi_replace (&gsi, new_stmt, true);
11397 : }
11398 : }
11399 2171 : else if (ifn == IFN_MASK_CALL
11400 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11401 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11402 2175 : && !STMT_VINFO_LIVE_P (stmt_info))
11403 : {
11404 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11405 4 : loop_vinfo->remove_stmt (stmt_info);
11406 : }
11407 : }
11408 : }
11409 :
11410 61418 : if (!uncounted_p)
11411 : {
11412 : /* The vectorization factor is always > 1, so if we use an IV increment of
11413 : 1. A zero NITERS becomes a nonzero NITERS_VECTOR. */
11414 61375 : if (integer_onep (step_vector))
11415 61357 : niters_no_overflow = true;
11416 :
11417 61375 : vect_set_loop_condition (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
11418 : loop_vinfo, niters_vector, step_vector,
11419 61375 : niters_vector_mult_vf, !niters_no_overflow);
11420 : }
11421 :
11422 61418 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11423 :
11424 : /* True if the final iteration might not handle a full vector's
11425 : worth of scalar iterations. */
11426 122836 : bool final_iter_may_be_partial
11427 61418 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11428 61418 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
11429 :
11430 : /* +1 to convert latch counts to loop iteration counts. */
11431 61418 : int bias_for_lowest = 1;
11432 :
11433 : /* When we are peeling for gaps then we take away one scalar iteration
11434 : from the vector loop. Thus we can adjust the upper bound by one
11435 : scalar iteration. But only when we know the bound applies to the
11436 : IV exit test which might not be true when we have multiple exits. */
11437 61418 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11438 119652 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11439 :
11440 61418 : int bias_for_assumed = bias_for_lowest;
11441 61418 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11442 61418 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11443 : {
11444 : /* When the amount of peeling is known at compile time, the first
11445 : iteration will have exactly alignment_npeels active elements.
11446 : In the worst case it will have at least one. */
11447 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11448 1 : bias_for_lowest += lowest_vf - min_first_active;
11449 1 : bias_for_assumed += assumed_vf - min_first_active;
11450 : }
11451 : /* In these calculations the "- 1" converts loop iteration counts
11452 : back to latch counts. */
11453 61418 : if (loop->any_upper_bound)
11454 : {
11455 61402 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11456 61402 : loop->nb_iterations_upper_bound
11457 61402 : = (final_iter_may_be_partial
11458 62810 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11459 2816 : lowest_vf) - 1
11460 59994 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11461 119988 : lowest_vf) - 1);
11462 61402 : if (main_vinfo
11463 : /* Both peeling for alignment and peeling for gaps can end up
11464 : with the scalar epilogue running for more than VF-1 iterations. */
11465 7056 : && !main_vinfo->peeling_for_alignment
11466 7008 : && !main_vinfo->peeling_for_gaps)
11467 : {
11468 6826 : unsigned int bound;
11469 6826 : poly_uint64 main_iters
11470 6826 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11471 : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11472 6826 : main_iters
11473 6826 : = upper_bound (main_iters,
11474 6826 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11475 13652 : if (can_div_away_from_zero_p (main_iters,
11476 6826 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11477 : &bound))
11478 6826 : loop->nb_iterations_upper_bound
11479 6826 : = wi::umin ((bound_wide_int) (bound - 1),
11480 6826 : loop->nb_iterations_upper_bound);
11481 : }
11482 : }
11483 61418 : if (loop->any_likely_upper_bound)
11484 61402 : loop->nb_iterations_likely_upper_bound
11485 61402 : = (final_iter_may_be_partial
11486 62810 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11487 1408 : + bias_for_lowest, lowest_vf) - 1
11488 59994 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11489 61402 : + bias_for_lowest, lowest_vf) - 1);
11490 61418 : if (loop->any_estimate)
11491 35447 : loop->nb_iterations_estimate
11492 35447 : = (final_iter_may_be_partial
11493 36140 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11494 1386 : assumed_vf) - 1
11495 34754 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11496 70201 : assumed_vf) - 1);
11497 61418 : scale_profile_for_vect_loop (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
11498 : assumed_vf, flat);
11499 :
11500 61418 : if (dump_enabled_p ())
11501 : {
11502 10947 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11503 : {
11504 9500 : dump_printf_loc (MSG_NOTE, vect_location,
11505 : "LOOP VECTORIZED\n");
11506 9500 : if (loop->inner)
11507 343 : dump_printf_loc (MSG_NOTE, vect_location,
11508 : "OUTER LOOP VECTORIZED\n");
11509 9500 : dump_printf (MSG_NOTE, "\n");
11510 : }
11511 : else
11512 1447 : dump_printf_loc (MSG_NOTE, vect_location,
11513 : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11514 1447 : GET_MODE_NAME (loop_vinfo->vector_mode));
11515 : }
11516 :
11517 : /* Loops vectorized with a variable factor won't benefit from
11518 : unrolling/peeling. */
11519 61418 : if (!vf.is_constant ())
11520 : {
11521 : loop->unroll = 1;
11522 : if (dump_enabled_p ())
11523 : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11524 : " variable-length vectorization factor\n");
11525 : }
11526 :
11527 : /* When we have unrolled the loop due to a user requested value we should
11528 : leave it up to the RTL unroll heuristics to determine if it's still worth
11529 : while to unroll more. */
11530 61418 : if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
11531 44 : loop->unroll = 0;
11532 :
11533 : /* Free SLP instances here because otherwise stmt reference counting
11534 : won't work. */
11535 : slp_instance instance;
11536 151200 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11537 89782 : vect_free_slp_instance (instance);
11538 61418 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11539 : /* Clear-up safelen field since its value is invalid after vectorization
11540 : since vectorized loop can have loop-carried dependencies. */
11541 61418 : loop->safelen = 0;
11542 :
11543 61418 : if (epilogue)
11544 : {
11545 : /* Accumulate past advancements made. */
11546 7056 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
11547 89 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
11548 : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
11549 : advance);
11550 7056 : update_epilogue_loop_vinfo (epilogue, advance);
11551 :
11552 7056 : epilogue->simduid = loop->simduid;
11553 7056 : epilogue->force_vectorize = loop->force_vectorize;
11554 7056 : epilogue->dont_vectorize = false;
11555 : }
11556 :
11557 61418 : return epilogue;
11558 61418 : }
11559 :
11560 : /* The code below is trying to perform simple optimization - revert
11561 : if-conversion for masked stores, i.e. if the mask of a store is zero
11562 : do not perform it and all stored value producers also if possible.
11563 : For example,
11564 : for (i=0; i<n; i++)
11565 : if (c[i])
11566 : {
11567 : p1[i] += 1;
11568 : p2[i] = p3[i] +2;
11569 : }
11570 : this transformation will produce the following semi-hammock:
11571 :
11572 : if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11573 : {
11574 : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11575 : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11576 : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11577 : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11578 : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11579 : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11580 : }
11581 : */
11582 :
11583 : void
11584 499 : optimize_mask_stores (class loop *loop)
11585 : {
11586 499 : basic_block *bbs = get_loop_body (loop);
11587 499 : unsigned nbbs = loop->num_nodes;
11588 499 : unsigned i;
11589 499 : basic_block bb;
11590 499 : class loop *bb_loop;
11591 499 : gimple_stmt_iterator gsi;
11592 499 : gimple *stmt;
11593 499 : auto_vec<gimple *> worklist;
11594 499 : auto_purge_vect_location sentinel;
11595 :
11596 499 : vect_location = find_loop_location (loop);
11597 : /* Pick up all masked stores in loop if any. */
11598 1996 : for (i = 0; i < nbbs; i++)
11599 : {
11600 998 : bb = bbs[i];
11601 17427 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11602 15431 : gsi_next (&gsi))
11603 : {
11604 15431 : stmt = gsi_stmt (gsi);
11605 15431 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11606 701 : worklist.safe_push (stmt);
11607 : }
11608 : }
11609 :
11610 499 : free (bbs);
11611 499 : if (worklist.is_empty ())
11612 68 : return;
11613 :
11614 : /* Loop has masked stores. */
11615 1115 : while (!worklist.is_empty ())
11616 : {
11617 684 : gimple *last, *last_store;
11618 684 : edge e, efalse;
11619 684 : tree mask;
11620 684 : basic_block store_bb, join_bb;
11621 684 : gimple_stmt_iterator gsi_to;
11622 684 : tree vdef, new_vdef;
11623 684 : gphi *phi;
11624 684 : tree vectype;
11625 684 : tree zero;
11626 :
11627 684 : last = worklist.pop ();
11628 684 : mask = gimple_call_arg (last, 2);
11629 684 : bb = gimple_bb (last);
11630 : /* Create then_bb and if-then structure in CFG, then_bb belongs to
11631 : the same loop as if_bb. It could be different to LOOP when two
11632 : level loop-nest is vectorized and mask_store belongs to the inner
11633 : one. */
11634 684 : e = split_block (bb, last);
11635 684 : bb_loop = bb->loop_father;
11636 684 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11637 684 : join_bb = e->dest;
11638 684 : store_bb = create_empty_bb (bb);
11639 684 : add_bb_to_loop (store_bb, bb_loop);
11640 684 : e->flags = EDGE_TRUE_VALUE;
11641 684 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11642 : /* Put STORE_BB to likely part. */
11643 684 : efalse->probability = profile_probability::likely ();
11644 684 : e->probability = efalse->probability.invert ();
11645 684 : store_bb->count = efalse->count ();
11646 684 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11647 684 : if (dom_info_available_p (CDI_DOMINATORS))
11648 684 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11649 684 : if (dump_enabled_p ())
11650 351 : dump_printf_loc (MSG_NOTE, vect_location,
11651 : "Create new block %d to sink mask stores.",
11652 : store_bb->index);
11653 : /* Create vector comparison with boolean result. */
11654 684 : vectype = TREE_TYPE (mask);
11655 684 : zero = build_zero_cst (vectype);
11656 684 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11657 684 : gsi = gsi_last_bb (bb);
11658 684 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11659 : /* Create new PHI node for vdef of the last masked store:
11660 : .MEM_2 = VDEF <.MEM_1>
11661 : will be converted to
11662 : .MEM.3 = VDEF <.MEM_1>
11663 : and new PHI node will be created in join bb
11664 : .MEM_2 = PHI <.MEM_1, .MEM_3>
11665 : */
11666 684 : vdef = gimple_vdef (last);
11667 684 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
11668 684 : gimple_set_vdef (last, new_vdef);
11669 684 : phi = create_phi_node (vdef, join_bb);
11670 684 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11671 :
11672 : /* Put all masked stores with the same mask to STORE_BB if possible. */
11673 718 : while (true)
11674 : {
11675 701 : gimple_stmt_iterator gsi_from;
11676 701 : gimple *stmt1 = NULL;
11677 :
11678 : /* Move masked store to STORE_BB. */
11679 701 : last_store = last;
11680 701 : gsi = gsi_for_stmt (last);
11681 701 : gsi_from = gsi;
11682 : /* Shift GSI to the previous stmt for further traversal. */
11683 701 : gsi_prev (&gsi);
11684 701 : gsi_to = gsi_start_bb (store_bb);
11685 701 : gsi_move_before (&gsi_from, &gsi_to);
11686 : /* Setup GSI_TO to the non-empty block start. */
11687 701 : gsi_to = gsi_start_bb (store_bb);
11688 701 : if (dump_enabled_p ())
11689 367 : dump_printf_loc (MSG_NOTE, vect_location,
11690 : "Move stmt to created bb\n%G", last);
11691 : /* Move all stored value producers if possible. */
11692 4976 : while (!gsi_end_p (gsi))
11693 : {
11694 4975 : tree lhs;
11695 4975 : imm_use_iterator imm_iter;
11696 4975 : use_operand_p use_p;
11697 4975 : bool res;
11698 :
11699 : /* Skip debug statements. */
11700 4975 : if (is_gimple_debug (gsi_stmt (gsi)))
11701 : {
11702 3 : gsi_prev (&gsi);
11703 3231 : continue;
11704 : }
11705 4972 : stmt1 = gsi_stmt (gsi);
11706 : /* Do not consider statements writing to memory or having
11707 : volatile operand. */
11708 9794 : if (gimple_vdef (stmt1)
11709 9794 : || gimple_has_volatile_ops (stmt1))
11710 : break;
11711 4822 : gsi_from = gsi;
11712 4822 : gsi_prev (&gsi);
11713 4822 : lhs = gimple_get_lhs (stmt1);
11714 4822 : if (!lhs)
11715 : break;
11716 :
11717 : /* LHS of vectorized stmt must be SSA_NAME. */
11718 4822 : if (TREE_CODE (lhs) != SSA_NAME)
11719 : break;
11720 :
11721 4822 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11722 : {
11723 : /* Remove dead scalar statement. */
11724 3560 : if (has_zero_uses (lhs))
11725 : {
11726 3228 : gsi_remove (&gsi_from, true);
11727 3228 : release_defs (stmt1);
11728 3228 : continue;
11729 : }
11730 : }
11731 :
11732 : /* Check that LHS does not have uses outside of STORE_BB. */
11733 1594 : res = true;
11734 4333 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11735 : {
11736 1695 : gimple *use_stmt;
11737 1695 : use_stmt = USE_STMT (use_p);
11738 1695 : if (is_gimple_debug (use_stmt))
11739 0 : continue;
11740 1695 : if (gimple_bb (use_stmt) != store_bb)
11741 : {
11742 : res = false;
11743 : break;
11744 : }
11745 1594 : }
11746 1594 : if (!res)
11747 : break;
11748 :
11749 1044 : if (gimple_vuse (stmt1)
11750 1480 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
11751 : break;
11752 :
11753 : /* Can move STMT1 to STORE_BB. */
11754 1044 : if (dump_enabled_p ())
11755 563 : dump_printf_loc (MSG_NOTE, vect_location,
11756 : "Move stmt to created bb\n%G", stmt1);
11757 1044 : gsi_move_before (&gsi_from, &gsi_to);
11758 : /* Shift GSI_TO for further insertion. */
11759 2088 : gsi_prev (&gsi_to);
11760 : }
11761 : /* Put other masked stores with the same mask to STORE_BB. */
11762 701 : if (worklist.is_empty ()
11763 270 : || gimple_call_arg (worklist.last (), 2) != mask
11764 17 : || worklist.last () != stmt1)
11765 : break;
11766 17 : last = worklist.pop ();
11767 17 : }
11768 1368 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11769 : }
11770 499 : }
11771 :
11772 : /* Decide whether it is possible to use a zero-based induction variable
11773 : when vectorizing LOOP_VINFO with partial vectors. If it is, return
11774 : the value that the induction variable must be able to hold in order
11775 : to ensure that the rgroups eventually have no active vector elements.
11776 : Return -1 otherwise. */
11777 :
11778 : widest_int
11779 33510 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11780 : {
11781 33510 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11782 33510 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11783 33510 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11784 :
11785 : /* Calculate the value that the induction variable must be able
11786 : to hit in order to ensure that we end the loop with an all-false mask.
11787 : This involves adding the maximum number of inactive trailing scalar
11788 : iterations. */
11789 33510 : widest_int iv_limit = -1;
11790 33510 : if (max_loop_iterations (loop, &iv_limit))
11791 : {
11792 33510 : if (niters_skip)
11793 : {
11794 : /* Add the maximum number of skipped iterations to the
11795 : maximum iteration count. */
11796 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
11797 0 : iv_limit += wi::to_widest (niters_skip);
11798 : else
11799 0 : iv_limit += max_vf - 1;
11800 : }
11801 33510 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11802 : /* Make a conservatively-correct assumption. */
11803 336 : iv_limit += max_vf - 1;
11804 :
11805 : /* IV_LIMIT is the maximum number of latch iterations, which is also
11806 : the maximum in-range IV value. Round this value down to the previous
11807 : vector alignment boundary and then add an extra full iteration. */
11808 33510 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11809 33510 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11810 : }
11811 33510 : return iv_limit;
11812 : }
11813 :
11814 : /* For the given rgroup_controls RGC, check whether an induction variable
11815 : would ever hit a value that produces a set of all-false masks or zero
11816 : lengths before wrapping around. Return true if it's possible to wrap
11817 : around before hitting the desirable value, otherwise return false. */
11818 :
11819 : bool
11820 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11821 : {
11822 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11823 :
11824 0 : if (iv_limit == -1)
11825 : return true;
11826 :
11827 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11828 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
11829 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11830 :
11831 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11832 : return true;
11833 :
11834 : return false;
11835 0 : }
|