Line data Source code
1 : /* Loop Vectorization
2 : Copyright (C) 2003-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #define INCLUDE_ALGORITHM
23 : #include "config.h"
24 : #include "system.h"
25 : #include "coretypes.h"
26 : #include "backend.h"
27 : #include "target.h"
28 : #include "rtl.h"
29 : #include "tree.h"
30 : #include "gimple.h"
31 : #include "cfghooks.h"
32 : #include "tree-pass.h"
33 : #include "ssa.h"
34 : #include "optabs-tree.h"
35 : #include "memmodel.h"
36 : #include "optabs.h"
37 : #include "diagnostic-core.h"
38 : #include "fold-const.h"
39 : #include "stor-layout.h"
40 : #include "cfganal.h"
41 : #include "gimplify.h"
42 : #include "gimple-iterator.h"
43 : #include "gimplify-me.h"
44 : #include "tree-ssa-loop-ivopts.h"
45 : #include "tree-ssa-loop-manip.h"
46 : #include "tree-ssa-loop-niter.h"
47 : #include "tree-ssa-loop.h"
48 : #include "cfgloop.h"
49 : #include "tree-scalar-evolution.h"
50 : #include "tree-vectorizer.h"
51 : #include "gimple-fold.h"
52 : #include "cgraph.h"
53 : #include "tree-cfg.h"
54 : #include "tree-if-conv.h"
55 : #include "internal-fn.h"
56 : #include "tree-vector-builder.h"
57 : #include "vec-perm-indices.h"
58 : #include "tree-eh.h"
59 : #include "case-cfn-macros.h"
60 : #include "langhooks.h"
61 : #include "opts.h"
62 : #include "hierarchical_discriminator.h"
63 :
64 : /* Loop Vectorization Pass.
65 :
66 : This pass tries to vectorize loops.
67 :
68 : For example, the vectorizer transforms the following simple loop:
69 :
70 : short a[N]; short b[N]; short c[N]; int i;
71 :
72 : for (i=0; i<N; i++){
73 : a[i] = b[i] + c[i];
74 : }
75 :
76 : as if it was manually vectorized by rewriting the source code into:
77 :
78 : typedef int __attribute__((mode(V8HI))) v8hi;
79 : short a[N]; short b[N]; short c[N]; int i;
80 : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
81 : v8hi va, vb, vc;
82 :
83 : for (i=0; i<N/8; i++){
84 : vb = pb[i];
85 : vc = pc[i];
86 : va = vb + vc;
87 : pa[i] = va;
88 : }
89 :
90 : The main entry to this pass is vectorize_loops(), in which
91 : the vectorizer applies a set of analyses on a given set of loops,
92 : followed by the actual vectorization transformation for the loops that
93 : had successfully passed the analysis phase.
94 : Throughout this pass we make a distinction between two types of
95 : data: scalars (which are represented by SSA_NAMES), and memory references
96 : ("data-refs"). These two types of data require different handling both
97 : during analysis and transformation. The types of data-refs that the
98 : vectorizer currently supports are ARRAY_REFS whose base is an array DECL
99 : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
100 : accesses are required to have a simple (consecutive) access pattern.
101 :
102 : Analysis phase:
103 : ===============
104 : The driver for the analysis phase is vect_analyze_loop().
105 : It applies a set of analyses, some of which rely on the scalar evolution
106 : analyzer (scev) developed by Sebastian Pop.
107 :
108 : During the analysis phase the vectorizer records some information
109 : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
110 : loop, as well as general information about the loop as a whole, which is
111 : recorded in a "loop_vec_info" struct attached to each loop.
112 :
113 : Transformation phase:
114 : =====================
115 : The loop transformation phase scans all the stmts in the loop, and
116 : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
117 : the loop that needs to be vectorized. It inserts the vector code sequence
118 : just before the scalar stmt S, and records a pointer to the vector code
119 : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
120 : attached to S). This pointer will be used for the vectorization of following
121 : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
122 : otherwise, we rely on dead code elimination for removing it.
123 :
124 : For example, say stmt S1 was vectorized into stmt VS1:
125 :
126 : VS1: vb = px[i];
127 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
128 : S2: a = b;
129 :
130 : To vectorize stmt S2, the vectorizer first finds the stmt that defines
131 : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
132 : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
133 : resulting sequence would be:
134 :
135 : VS1: vb = px[i];
136 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
137 : VS2: va = vb;
138 : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
139 :
140 : Operands that are not SSA_NAMEs, are data-refs that appear in
141 : load/store operations (like 'x[i]' in S1), and are handled differently.
142 :
143 : Target modeling:
144 : =================
145 : Currently the only target specific information that is used is the
146 : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
147 : Targets that can support different sizes of vectors, for now will need
148 : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
149 : flexibility will be added in the future.
150 :
151 : Since we only vectorize operations whose vector form can be
152 : expressed using existing tree codes, to verify that an operation is
153 : supported, the vectorizer checks the relevant optab at the relevant
154 : machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
155 : the value found is CODE_FOR_nothing, then there's no target support, and
156 : we can't vectorize the stmt.
157 :
158 : For additional information on this project see:
159 : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 : */
161 : /* Forward declarations of file-local (static) helpers that are declared here ahead of their use. */
162 : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
163 : unsigned *);
164 : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
165 : gphi **);
166 :
167 :
168 : /* Function vect_is_simple_iv_evolution.
169 : Return true if ACCESS_FN has a simple evolution in loop LOOP_NB, storing its initial value and step in STMT_INFO.
170 : FORNOW: A simple evolution of an induction variable in the loop is
171 : considered a polynomial evolution. */
172 :
173 : static bool
174 909546 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
175 : stmt_vec_info stmt_info)
176 : {
177 909546 : tree init_expr;
178 909546 : tree step_expr;
179 909546 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
180 909546 : basic_block bb;
181 :
182 : /* When there is no evolution in this loop, the evolution function
183 : is not "simple". */
184 909546 : if (evolution_part == NULL_TREE)
185 : return false;
186 :
187 : /* When the evolution is a polynomial of degree >= 2
188 : the evolution function is not "simple" (the evolution part is itself a chrec). */
189 973395 : if (tree_is_chrec (evolution_part))
190 : return false;
191 :
192 799270 : step_expr = evolution_part;
193 799270 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
194 :
195 799270 : if (dump_enabled_p ())
196 39910 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
197 : step_expr, init_expr);
198 : /* Record base and step on STMT_INFO; note this happens before the step is validated, so both are set even when we return false below. */
199 799270 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
200 799270 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;
201 : /* Accept the step only if it is an INTEGER_CST, an SSA_NAME not defined inside the loop whose type is integral (or scalar float with flag_associative_math), or a REAL_CST with flag_associative_math. */
202 799270 : if (TREE_CODE (step_expr) != INTEGER_CST
203 71598 : && (TREE_CODE (step_expr) != SSA_NAME
204 56513 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
205 56250 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
206 7787 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
207 131 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
208 131 : || !flag_associative_math)))
209 863176 : && (TREE_CODE (step_expr) != REAL_CST
210 431 : || !flag_associative_math))
211 : {
212 63849 : if (dump_enabled_p ())
213 3086 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
214 : "step unknown.\n");
215 63849 : return false;
216 : }
217 :
218 : return true;
219 : }
220 :
220 : /* Function vect_is_nonlinear_iv_evolution
221 : Return true if LOOP_PHI_NODE (a two-argument PHI of LOOP) is a supported nonlinear induction, recording the evolution type, the preheader value and the step in STMT_INFO.
222 : Only support nonlinear induction for integer type
223 : 1. neg
224 : 2. mul by constant
225 : 3. lshift/rshift by constant.
226 : In each case the first operand of the update statement must be the PHI result itself.
227 : For neg induction, return a fake step as integer -1. */
228 : static bool
229 171558 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
230 : gphi* loop_phi_node)
231 : {
232 171558 : tree init_expr, ev_expr, result, op1, op2;
233 171558 : gimple* def;
234 :
235 171558 : if (gimple_phi_num_args (loop_phi_node) != 2)
236 : return false;
237 : /* INIT_EXPR is the value entering from the preheader, EV_EXPR the value coming back over the latch. */
238 171558 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
239 171558 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
240 :
241 : /* Support nonlinear induction only for integer type. */
242 171558 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
243 : return false;
244 :
245 108323 : result = PHI_RESULT (loop_phi_node);
246 : /* The latch value must be an SSA_NAME defined by a gimple assignment. */
247 108323 : if (TREE_CODE (ev_expr) != SSA_NAME
248 106060 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false) /* Assigns DEF; this operand is never true. */
249 108323 : || !is_gimple_assign (def))
250 : return false;
251 :
252 97524 : enum tree_code t_code = gimple_assign_rhs_code (def);
253 97524 : tree step;
254 97524 : switch (t_code)
255 : {
256 3510 : case NEGATE_EXPR:
257 3510 : if (gimple_assign_rhs1 (def) != result)
258 : return false;
259 3510 : step = build_int_cst (TREE_TYPE (init_expr), -1);
260 3510 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
261 3510 : break;
262 :
263 11326 : case RSHIFT_EXPR:
264 11326 : case LSHIFT_EXPR:
265 11326 : case MULT_EXPR:
266 11326 : op1 = gimple_assign_rhs1 (def);
267 11326 : op2 = gimple_assign_rhs2 (def);
268 11326 : if (TREE_CODE (op2) != INTEGER_CST
269 7438 : || op1 != result)
270 : return false;
271 7051 : step = op2;
272 7051 : if (t_code == LSHIFT_EXPR)
273 472 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
274 6579 : else if (t_code == RSHIFT_EXPR)
275 5609 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
276 : /* Only MULT_EXPR remains here; NEGATE_EXPR is handled above as vect_step_op_neg. */
277 : else
278 970 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
279 : break;
280 :
281 : default:
282 : return false;
283 : }
284 :
285 10561 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
286 10561 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
287 :
288 10561 : return true;
289 : }
291 :
292 : /* Returns true if Phi is a first-order recurrence. A first-order
293 : recurrence is a non-reduction recurrence relation in which the value of
294 : the recurrence in the current loop iteration equals a value defined in
295 : the previous iteration. LOOP must be the loop of LOOP_VINFO itself, and a vector type must exist for the PHI result's scalar type. */
296 :
297 : static bool
298 66211 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
299 : gphi *phi)
300 : {
301 : /* A nested cycle isn't vectorizable as first order recurrence. */
302 66211 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
303 : return false;
304 :
305 : /* Ensure the loop latch definition is from within the loop: it must be a non-default SSA_NAME defined by a non-PHI statement inside LOOP. */
306 66069 : edge latch = loop_latch_edge (loop);
307 66069 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
308 66069 : if (TREE_CODE (ldef) != SSA_NAME
309 63463 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
310 63397 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
311 124905 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
312 7886 : return false;
313 :
314 58183 : tree def = gimple_phi_result (phi);
315 :
316 : /* Ensure every use_stmt of the phi node is dominated by the latch
317 : definition. Debug uses are ignored; a use by the latch definition statement itself is rejected. */
318 58183 : imm_use_iterator imm_iter;
319 58183 : use_operand_p use_p;
320 128681 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
321 69987 : if (!is_gimple_debug (USE_STMT (use_p))
322 136344 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
323 45855 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
324 : USE_STMT (use_p))))
325 57672 : return false;
326 :
327 : /* First-order recurrence autovectorization needs shuffle vector, so require that a vector type exists for the scalar type. */
328 511 : tree scalar_type = TREE_TYPE (def);
329 511 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
330 511 : if (!vectype)
331 : return false;
332 :
333 : return true;
334 : }
335 :
336 : /* Function vect_analyze_scalar_cycles_1.
337 :
338 : Examine the cross iteration def-use cycles of scalar variables
339 : in LOOP. LOOP_VINFO represents the loop that is now being
340 : considered for vectorization (can be LOOP, or an outer-loop
341 : enclosing LOOP). Each non-virtual header PHI of LOOP is classified as an induction, reduction, double reduction,
342 : nested cycle or first-order recurrence; otherwise its def type is left as vect_unknown_def_type. */
343 :
344 : static void
345 446232 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
346 : {
347 446232 : basic_block bb = loop->header;
348 446232 : auto_vec<stmt_vec_info, 64> worklist;
349 446232 : gphi_iterator gsi;
350 :
351 446232 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
352 :
353 : /* First - identify all inductions. Reduction detection assumes that all the
354 : inductions have been identified, therefore, this order must not be
355 : changed. */
356 1597170 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
357 : {
358 1150938 : gphi *phi = gsi.phi ();
359 1150938 : tree access_fn = NULL;
360 1150938 : tree def = PHI_RESULT (phi);
361 1150938 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
362 :
363 : /* Skip virtual phi's. The data dependences that are associated with
364 : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
365 2301876 : if (virtual_operand_p (def))
366 404962 : continue;
367 :
368 : /* Skip already analyzed inner loop PHIs of double reductions. */
369 910551 : if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
370 1005 : continue;
371 :
372 909546 : if (dump_enabled_p ())
373 42038 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
374 : (gimple *) phi);
375 :
376 909546 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
377 :
378 : /* Analyze the evolution function. */
379 909546 : access_fn = analyze_scalar_evolution (loop, def);
380 909546 : if (dump_enabled_p ())
381 42038 : dump_printf_loc (MSG_NOTE, vect_location,
382 : "Access function of PHI: %T\n", access_fn);
383 909546 : if (access_fn)
384 909546 : STRIP_NOPS (access_fn);
385 : /* Defer the PHI to the second phase unless it is a simple IV (with an INTEGER_CST step when LOOP is not the vectorized loop) or, for the vectorized loop itself, a supported nonlinear IV. */
386 1073116 : if ((!access_fn
387 909546 : || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
388 735421 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
389 11368 : && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
390 : != INTEGER_CST)))
391 : /* Only handle nonlinear iv for same loop. */
392 1083677 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
393 171558 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
394 : {
395 163570 : worklist.safe_push (stmt_vinfo);
396 163570 : continue;
397 : }
398 :
399 745976 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
400 : != NULL_TREE);
401 745976 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
402 :
403 745976 : if (dump_enabled_p ())
404 36933 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
405 745976 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
406 :
407 : /* Mark if we have a non-linear IV. Accumulate rather than overwrite, so that a linear IV analyzed later (in this loop or in the inner-loop pass) cannot clear the flag set by an earlier non-linear IV. */
408 745976 : LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
409 745976 : |= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
410 : }
411 :
412 :
413 : /* Second - identify all reductions and nested cycles. */
414 609802 : while (worklist.length () > 0)
415 : {
416 163570 : stmt_vec_info stmt_vinfo = worklist.pop ();
417 163570 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
418 163570 : tree def = PHI_RESULT (phi);
419 :
420 163570 : if (dump_enabled_p ())
421 5105 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
422 : (gimple *) phi);
423 :
424 327140 : gcc_assert (!virtual_operand_p (def)
425 : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
426 :
427 163570 : gphi *double_reduc;
428 163570 : stmt_vec_info reduc_stmt_info
429 163570 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
430 163570 : if (reduc_stmt_info && double_reduc)
431 : {
432 1107 : stmt_vec_info inner_phi_info
433 1107 : = loop_vinfo->lookup_stmt (double_reduc);
434 : /* ??? Pass down flag we're the inner loop of a double reduc. */
435 1107 : stmt_vec_info inner_reduc_info
436 1107 : = vect_is_simple_reduction (loop_vinfo, inner_phi_info, NULL);
437 1107 : if (inner_reduc_info)
438 : {
439 1005 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
440 1005 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
441 1005 : STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
442 1005 : STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
443 1005 : if (dump_enabled_p ())
444 130 : dump_printf_loc (MSG_NOTE, vect_location,
445 : "Detected double reduction.\n");
446 :
447 1005 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
448 1005 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
449 1005 : STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
450 : /* Make it accessible for SLP vectorization. */
451 1005 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
452 : }
453 102 : else if (dump_enabled_p ())
454 14 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
455 : "Unknown def-use cycle pattern.\n");
456 : }
457 162463 : else if (reduc_stmt_info)
458 : {
459 96252 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
460 : {
461 2431 : if (dump_enabled_p ())
462 434 : dump_printf_loc (MSG_NOTE, vect_location,
463 : "Detected vectorizable nested cycle.\n");
464 :
465 2431 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
466 : }
467 : else
468 : {
469 93821 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
470 93821 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
471 93821 : if (dump_enabled_p ())
472 3974 : dump_printf_loc (MSG_NOTE, vect_location,
473 : "Detected reduction.\n");
474 :
475 93821 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
476 93821 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
477 93821 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
478 : }
479 : }
480 66211 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
481 505 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
482 : else
483 65706 : if (dump_enabled_p ())
484 476 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
485 : "Unknown def-use cycle pattern.\n");
486 : }
487 446232 : }
488 :
489 :
490 : /* Function vect_analyze_scalar_cycles.
491 :
492 : Examine the cross iteration def-use cycles of scalar variables, by
493 : analyzing the loop-header PHIs of scalar variables. Classify each
494 : cycle as one of the following: invariant, induction, reduction, unknown.
495 : We do that for the loop represented by LOOP_VINFO, and also for its
496 : inner-loop (loop->inner), if one exists; both passes go through vect_analyze_scalar_cycles_1.
497 : Examples for scalar cycles:
498 :
499 : Example1: reduction:
500 :
501 : loop1:
502 : for (i=0; i<N; i++)
503 : sum += a[i];
504 :
505 : Example2: induction:
506 :
507 : loop2:
508 : for (i=0; i<N; i++)
509 : a[i] = i; */
510 :
511 : static void
512 440452 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
513 : {
514 440452 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
515 : /* Analyze the loop being vectorized first. */
516 440452 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
517 :
518 : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
519 : Reductions in such inner-loop therefore have different properties than
520 : the reductions in the nest that gets vectorized:
521 : 1. When vectorized, they are executed in the same order as in the original
522 : scalar loop, so we can't change the order of computation when
523 : vectorizing them.
524 : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
525 : current checks are too strict. */
526 :
527 440452 : if (loop->inner)
528 5780 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
529 440452 : }
530 :
531 : /* Function vect_get_loop_niters.
532 :
533 : Determine how many iterations the loop is executed and place it
534 : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
535 : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
536 : niter information holds in ASSUMPTIONS.
537 : Only MAIN_EXIT is analyzed for the iteration count; the other exits only contribute their exit condition. When that analysis fails, the outputs keep their initial values (boolean_true_node for ASSUMPTIONS, chrec_dont_know for both counts).
538 : Return the loop exit conditions. */
539 :
540 :
541 : static vec<gcond *>
542 278515 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
543 : tree *number_of_iterations, tree *number_of_iterationsm1)
544 : {
545 278515 : auto_vec<edge> exits = get_loop_exit_edges (loop);
546 278515 : vec<gcond *> conds;
547 557030 : conds.create (exits.length ());
548 278515 : class tree_niter_desc niter_desc;
549 278515 : tree niter_assumptions, niter, may_be_zero;
550 : /* Start from "no assumptions needed, iteration counts unknown". */
551 278515 : *assumptions = boolean_true_node;
552 278515 : *number_of_iterationsm1 = chrec_dont_know;
553 278515 : *number_of_iterations = chrec_dont_know;
554 :
555 278515 : DUMP_VECT_SCOPE ("get_loop_niters");
556 :
557 278515 : if (exits.is_empty ())
558 0 : return conds;
559 :
560 278515 : if (dump_enabled_p ())
561 14647 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
562 : exits.length ());
563 :
564 : edge exit;
565 : unsigned int i;
566 677583 : FOR_EACH_VEC_ELT (exits, i, exit)
567 : {
568 399068 : gcond *cond = get_loop_exit_condition (exit);
569 399068 : if (cond)
570 399035 : conds.safe_push (cond);
571 :
572 399068 : if (dump_enabled_p ())
573 15803 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
574 : /* Only the main exit determines the iteration count. */
575 399068 : if (exit != main_exit)
576 178800 : continue;
577 :
578 278515 : may_be_zero = NULL_TREE;
579 278515 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
580 278515 : || chrec_contains_undetermined (niter_desc.niter))
581 58247 : continue;
582 :
583 220268 : niter_assumptions = niter_desc.assumptions;
584 220268 : may_be_zero = niter_desc.may_be_zero;
585 220268 : niter = niter_desc.niter;
586 : /* A may_be_zero condition that is constant false needs no handling. */
587 220268 : if (may_be_zero && integer_zerop (may_be_zero))
588 : may_be_zero = NULL_TREE;
589 :
590 9480 : if (may_be_zero)
591 : {
592 9480 : if (COMPARISON_CLASS_P (may_be_zero))
593 : {
594 : /* Try to combine may_be_zero with assumptions, this can simplify
595 : computation of niter expression. Otherwise rewrite niter as may_be_zero ? 0 : niter. */
596 9480 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
597 951 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
598 : niter_assumptions,
599 : fold_build1 (TRUTH_NOT_EXPR,
600 : boolean_type_node,
601 : may_be_zero));
602 : else
603 8529 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
604 : build_int_cst (TREE_TYPE (niter), 0),
605 : rewrite_to_non_trapping_overflow (niter));
606 :
607 220268 : may_be_zero = NULL_TREE;
608 : }
609 0 : else if (integer_nonzerop (may_be_zero))
610 : {
611 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
612 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
613 0 : continue;
614 : }
615 : else
616 0 : continue;
617 : }
618 :
619 : /* Loop assumptions are based off the normal exit. */
620 220268 : *assumptions = niter_assumptions;
621 220268 : *number_of_iterationsm1 = niter;
622 :
623 : /* We want the number of loop header executions which is the number
624 : of latch executions plus one.
625 : ??? For UINT_MAX latch executions this number overflows to zero
626 : for loops like do { n++; } while (n != 0); */
627 220268 : if (niter && !chrec_contains_undetermined (niter))
628 : {
629 220268 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
630 : unshare_expr (niter),
631 : build_int_cst (TREE_TYPE (niter), 1));
632 220268 : if (TREE_CODE (niter) == INTEGER_CST
633 121261 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
634 : {
635 : /* If we manage to fold niter + 1 into INTEGER_CST even when
636 : niter is some complex expression, ensure back
637 : *number_of_iterationsm1 is an INTEGER_CST as well. See
638 : PR113210. */
639 0 : *number_of_iterationsm1
640 0 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
641 : build_minus_one_cst (TREE_TYPE (niter)));
642 : }
643 : }
644 220268 : *number_of_iterations = niter;
645 : }
646 : /* NOTE(review): the message below is printed unconditionally, including when the main exit analysis above bailed out via continue. */
647 278515 : if (dump_enabled_p ())
648 14647 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
649 :
650 278515 : return conds;
651 278515 : }
652 :
653 : /* Determine the main loop exit for the vectorizer. Returns NULL if LOOP has no exits or has an exit without an exit condition; returns the sole exit if there is exactly one; otherwise returns the candidate selected below (which may be NULL). */
654 :
655 : edge
656 490724 : vec_init_loop_exit_info (class loop *loop)
657 : {
658 : /* Before we begin we must first determine which exit is the main one and
659 : which are auxiliary exits. */
660 490724 : auto_vec<edge> exits = get_loop_exit_edges (loop);
661 976393 : if (exits.length () == 0)
662 : return NULL;
663 485669 : if (exits.length () == 1)
664 320846 : return exits[0];
665 :
666 : /* If we have multiple exits, look for counting IV exit.
667 : Analyze all exits; an analyzable exit replaces the current candidate when its source block is dominated by the candidate's source block. */
668 164823 : class tree_niter_desc niter_desc;
669 164823 : edge candidate = NULL;
670 610274 : for (edge exit : exits)
671 : {
672 465476 : if (!get_loop_exit_condition (exit))
673 : {
674 20025 : if (dump_enabled_p ())
675 14 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
676 : "Unhandled loop exit detected.\n");
677 20025 : return NULL;
678 : }
679 :
680 445451 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
681 445451 : && !chrec_contains_undetermined (niter_desc.niter))
682 : {
683 132468 : tree may_be_zero = niter_desc.may_be_zero;
684 132468 : if ((integer_zerop (may_be_zero)
685 : /* As we are handling may_be_zero that's not false by
686 : rewriting niter to may_be_zero ? 0 : niter we require
687 : an empty latch. */
688 455812 : || (single_pred_p (loop->latch)
689 10026 : && exit->src == single_pred (loop->latch)
690 2535 : && (integer_nonzerop (may_be_zero)
691 2535 : || COMPARISON_CLASS_P (may_be_zero))))
692 135003 : && (!candidate
693 5898 : || dominated_by_p (CDI_DOMINATORS, exit->src,
694 5898 : candidate->src)))
695 : candidate = exit;
696 : }
697 : }
698 :
699 : /* If no exit is analyzable by scalar evolution, we fall back to the result of loop_exits_from_bb_p for the latch's single predecessor
700 : under the assumption we are dealing with an uncounted loop. */
701 199153 : if (!candidate && single_pred_p (loop->latch))
702 34330 : candidate = loop_exits_from_bb_p (loop, single_pred (loop->latch));
703 :
704 : return candidate;
705 164823 : }
706 :
707 : /* Function bb_in_loop_p
708 :
709 : Used as predicate for dfs order traversal of the loop bbs. DATA is the class loop being traversed; returns true iff BB is inside that loop. */
710 :
711 : static bool
712 1679355 : bb_in_loop_p (const_basic_block bb, const void *data)
713 : {
714 1679355 : const class loop *const loop = (const class loop *)data;
715 1679355 : if (flow_bb_inside_loop_p (loop, bb))
716 : return true;
717 : return false;
718 : }
719 :
720 :
721 : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
722 : stmt_vec_info structs for all the stmts in LOOP_IN. Debug statements are skipped, every visited statement gets its UID reset to 0, and the "#pragma omp simd if" condition is recorded in simd_if_cond when found. */
723 :
724 575326 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
725 : : vec_info (vec_info::loop, shared),
726 575326 : loop (loop_in),
727 575326 : num_itersm1 (NULL_TREE),
728 575326 : num_iters (NULL_TREE),
729 575326 : num_iters_unchanged (NULL_TREE),
730 575326 : num_iters_assumptions (NULL_TREE),
731 575326 : vector_costs (nullptr),
732 575326 : scalar_costs (nullptr),
733 575326 : th (0),
734 575326 : versioning_threshold (0),
735 575326 : vectorization_factor (0),
736 575326 : main_loop_edge (nullptr),
737 575326 : skip_main_loop_edge (nullptr),
738 575326 : skip_this_loop_edge (nullptr),
739 575326 : reusable_accumulators (),
740 575326 : suggested_unroll_factor (1),
741 575326 : max_vectorization_factor (0),
742 575326 : mask_skip_niters (NULL_TREE),
743 575326 : mask_skip_niters_pfa_offset (NULL_TREE),
744 575326 : rgroup_compare_type (NULL_TREE),
745 575326 : simd_if_cond (NULL_TREE),
746 575326 : partial_vector_style (vect_partial_vectors_none),
747 575326 : unaligned_dr (NULL),
748 575326 : peeling_for_alignment (0),
749 575326 : ptr_mask (0),
750 575326 : max_spec_read_amount (0),
751 575326 : nonlinear_iv (false),
752 575326 : ivexpr_map (NULL),
753 575326 : scan_map (NULL),
754 575326 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
755 575326 : vectorizable (false),
756 575326 : can_use_partial_vectors_p (true),
757 575326 : must_use_partial_vectors_p (false),
758 575326 : using_partial_vectors_p (false),
759 575326 : using_decrementing_iv_p (false),
760 575326 : using_select_vl_p (false),
761 575326 : allow_mutual_alignment (false),
762 575326 : partial_load_store_bias (0),
763 575326 : peeling_for_gaps (false),
764 575326 : peeling_for_niter (false),
765 575326 : early_breaks (false),
766 575326 : loop_iv_cond (NULL),
767 575326 : user_unroll (false),
768 575326 : no_data_dependencies (false),
769 575326 : has_mask_store (false),
770 575326 : scalar_loop_scaling (profile_probability::uninitialized ()),
771 575326 : scalar_loop (NULL),
772 575326 : main_loop_info (NULL),
773 575326 : orig_loop_info (NULL),
774 575326 : epilogue_vinfo (NULL),
775 575326 : drs_advanced_by (NULL_TREE),
776 575326 : vec_loop_main_exit (NULL),
777 575326 : vec_epilogue_loop_main_exit (NULL),
778 575326 : scalar_loop_main_exit (NULL),
779 575326 : early_break_needs_epilogue (false),
780 575326 : early_break_niters_var (NULL)
781 : {
782 : /* CHECKME: We want to visit all BBs before their successors (except for
783 : latch blocks, for which this assertion wouldn't hold). In the simple
784 : case of the loop forms we allow, a dfs order of the BBs would be the same
785 : as reversed postorder traversal, so we are safe. */
786 : /* Collect the loop's blocks in DFS order from the header; the walk must find exactly loop->num_nodes blocks. */
787 575326 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
788 1150652 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
789 575326 : loop->num_nodes, loop);
790 575326 : gcc_assert (nbbs == loop->num_nodes);
791 :
792 2003910 : for (unsigned int i = 0; i < nbbs; i++)
793 : {
794 1428584 : basic_block bb = bbs[i];
795 1428584 : gimple_stmt_iterator si;
796 : /* Register every PHI of the block. */
797 2952232 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
798 : {
799 1523648 : gimple *phi = gsi_stmt (si);
800 1523648 : gimple_set_uid (phi, 0);
801 1523648 : add_stmt (phi);
802 : }
803 : /* Register every non-debug statement of the block. */
804 13230563 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
805 : {
806 10373395 : gimple *stmt = gsi_stmt (si);
807 10373395 : gimple_set_uid (stmt, 0);
808 10373395 : if (is_gimple_debug (stmt))
809 4405261 : continue;
810 5968134 : add_stmt (stmt);
811 : /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
812 : third argument is the #pragma omp simd if (x) condition, when 0,
813 : loop shouldn't be vectorized, when non-zero constant, it should
814 : be vectorized normally, otherwise versioned with vectorized loop
815 : done if the condition is non-zero at runtime. */
816 5968134 : if (loop_in->simduid
817 43372 : && is_gimple_call (stmt)
818 4268 : && gimple_call_internal_p (stmt)
819 4141 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
820 4137 : && gimple_call_num_args (stmt) >= 3
821 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
822 5968237 : && (loop_in->simduid
823 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
824 : {
825 103 : tree arg = gimple_call_arg (stmt, 2);
826 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
827 103 : simd_if_cond = arg;
828 : else
829 0 : gcc_assert (integer_nonzerop (arg));
830 : }
831 : }
832 : }
833 575326 : }
834 :
835 : /* Free all levels of rgroup CONTROLS. */
836 :
837 : void
838 1426505 : release_vec_loop_controls (vec<rgroup_controls> *controls)
839 : {
840 1426505 : rgroup_controls *rgc;
841 1426505 : unsigned int i;
842 1451011 : FOR_EACH_VEC_ELT (*controls, i, rgc)
843 24506 : rgc->controls.release ();
844 1426505 : controls->release ();
845 1426505 : }
846 :
847 : /* Free all memory used by the _loop_vec_info, as well as all the
848 : stmt_vec_info structs of all the stmts in the loop. */
849 :
850 575326 : _loop_vec_info::~_loop_vec_info ()
851 : {
852 575326 : free (bbs);
853 :
854 575326 : release_vec_loop_controls (&masks.rgc_vec);
855 575326 : release_vec_loop_controls (&lens);
856 579219 : delete ivexpr_map;
857 575648 : delete scan_map;
858 575326 : delete scalar_costs;
859 575326 : delete vector_costs;
860 787471 : for (auto reduc_info : reduc_infos)
861 203618 : delete reduc_info;
862 :
863 : /* When we release an epiloge vinfo that we do not intend to use
864 : avoid clearing AUX of the main loop which should continue to
865 : point to the main loop vinfo since otherwise we'll leak that. */
866 575326 : if (loop->aux == this)
867 61807 : loop->aux = NULL;
868 1150652 : }
869 :
870 : /* Return an invariant or register for EXPR and emit necessary
871 : computations in the LOOP_VINFO loop preheader. */
872 :
873 : tree
874 20502 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
875 : {
876 20502 : if (is_gimple_reg (expr)
877 20502 : || is_gimple_min_invariant (expr))
878 6956 : return expr;
879 :
880 13546 : if (! loop_vinfo->ivexpr_map)
881 3893 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
882 13546 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
883 13546 : if (! cached)
884 : {
885 8676 : gimple_seq stmts = NULL;
886 8676 : cached = force_gimple_operand (unshare_expr (expr),
887 : &stmts, true, NULL_TREE);
888 8676 : if (stmts)
889 : {
890 8528 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
891 8528 : gsi_insert_seq_on_edge_immediate (e, stmts);
892 : }
893 : }
894 13546 : return cached;
895 : }
896 :
897 : /* Return true if we can use CMP_TYPE as the comparison type to produce
898 : all masks required to mask LOOP_VINFO. */
899 :
900 : static bool
901 109782 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
902 : {
903 109782 : rgroup_controls *rgm;
904 109782 : unsigned int i;
905 125504 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
906 125504 : if (rgm->type != NULL_TREE
907 125504 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
908 : cmp_type, rgm->type,
909 : OPTIMIZE_FOR_SPEED))
910 : return false;
911 : return true;
912 : }
913 :
914 : /* Calculate the maximum number of scalars per iteration for every
915 : rgroup in LOOP_VINFO. */
916 :
917 : static unsigned int
918 23393 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
919 : {
920 23393 : unsigned int res = 1;
921 23393 : unsigned int i;
922 23393 : rgroup_controls *rgm;
923 56100 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
924 32707 : res = MAX (res, rgm->max_nscalars_per_iter);
925 23393 : return res;
926 : }
927 :
928 : /* Calculate the minimum precision necessary to represent:
929 :
930 : MAX_NITERS * FACTOR
931 :
932 : as an unsigned integer, where MAX_NITERS is the maximum number of
933 : loop header iterations for the original scalar form of LOOP_VINFO. */
934 :
935 : unsigned
936 25774 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
937 : {
938 25774 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
939 :
940 : /* Get the maximum number of iterations that is representable
941 : in the counter type. */
942 25774 : tree ni_type;
943 25774 : if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
944 25774 : ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
945 : else
946 0 : ni_type = sizetype;
947 25774 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
948 :
949 : /* Get a more refined estimate for the number of iterations. */
950 25774 : widest_int max_back_edges;
951 25774 : if (max_loop_iterations (loop, &max_back_edges))
952 25774 : max_ni = wi::smin (max_ni, max_back_edges + 1);
953 :
954 : /* Work out how many bits we need to represent the limit. */
955 25774 : return wi::min_precision (max_ni * factor, UNSIGNED);
956 25774 : }
957 :
958 : /* True if the loop needs peeling or partial vectors when vectorized. */
959 :
960 : static bool
961 155269 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
962 : {
963 155269 : unsigned HOST_WIDE_INT const_vf;
964 :
965 155269 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
966 : return true;
967 :
968 13362 : loop_vec_info main_loop_vinfo
969 153983 : = (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
970 153983 : ? LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) : loop_vinfo);
971 153983 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
972 79008 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo) >= 0)
973 : {
974 : /* Work out the (constant) number of iterations that need to be
975 : peeled for reasons other than niters. */
976 78958 : unsigned int peel_niter
977 : = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
978 78958 : return !multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
979 78958 : LOOP_VINFO_VECT_FACTOR (loop_vinfo));
980 : }
981 :
982 75025 : if (!LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo)
983 75025 : && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf))
984 : {
985 : /* When the number of iterations is a multiple of the vectorization
986 : factor and we are not doing prologue or forced epilogue peeling
987 : the epilogue isn't necessary. */
988 74611 : if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
989 149222 : >= (unsigned) exact_log2 (const_vf))
990 : return false;
991 : }
992 :
993 : return true;
994 : }
995 :
996 : /* Each statement in LOOP_VINFO can be masked where necessary. Check
997 : whether we can actually generate the masks required. Return true if so,
998 : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
999 :
1000 : static bool
1001 23393 : vect_verify_full_masking (loop_vec_info loop_vinfo)
1002 : {
1003 23393 : unsigned int min_ni_width;
1004 :
1005 : /* Use a normal loop if there are no statements that need masking.
1006 : This only happens in rare degenerate cases: it means that the loop
1007 : has no loads, no stores, and no live-out values. */
1008 23393 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1009 : return false;
1010 :
1011 : /* Produce the rgroup controls. */
1012 92189 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1013 : {
1014 34398 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1015 34398 : tree vectype = mask.first;
1016 34398 : unsigned nvectors = mask.second;
1017 :
1018 45403 : if (masks->rgc_vec.length () < nvectors)
1019 25614 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1020 34398 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1021 : /* The number of scalars per iteration and the number of vectors are
1022 : both compile-time constants. */
1023 34398 : unsigned int nscalars_per_iter
1024 34398 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1025 34398 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1026 :
1027 34398 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1028 : {
1029 27427 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1030 27427 : rgm->type = truth_type_for (vectype);
1031 27427 : rgm->factor = 1;
1032 : }
1033 : }
1034 :
1035 23393 : unsigned int max_nscalars_per_iter
1036 23393 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1037 :
1038 : /* Work out how many bits we need to represent the limit. */
1039 23393 : min_ni_width
1040 23393 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1041 :
1042 : /* Find a scalar mode for which WHILE_ULT is supported. */
1043 23393 : opt_scalar_int_mode cmp_mode_iter;
1044 23393 : tree cmp_type = NULL_TREE;
1045 23393 : tree iv_type = NULL_TREE;
1046 23393 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1047 23393 : unsigned int iv_precision = UINT_MAX;
1048 :
1049 23393 : if (iv_limit != -1)
1050 23393 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1051 : UNSIGNED);
1052 :
1053 187144 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1054 : {
1055 163751 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1056 163751 : if (cmp_bits >= min_ni_width
1057 163751 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1058 : {
1059 109782 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1060 109782 : if (this_type
1061 109782 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1062 : {
1063 : /* Although we could stop as soon as we find a valid mode,
1064 : there are at least two reasons why that's not always the
1065 : best choice:
1066 :
1067 : - An IV that's Pmode or wider is more likely to be reusable
1068 : in address calculations than an IV that's narrower than
1069 : Pmode.
1070 :
1071 : - Doing the comparison in IV_PRECISION or wider allows
1072 : a natural 0-based IV, whereas using a narrower comparison
1073 : type requires mitigations against wrap-around.
1074 :
1075 : Conversely, if the IV limit is variable, doing the comparison
1076 : in a wider type than the original type can introduce
1077 : unnecessary extensions, so picking the widest valid mode
1078 : is not always a good choice either.
1079 :
1080 : Here we prefer the first IV type that's Pmode or wider,
1081 : and the first comparison type that's IV_PRECISION or wider.
1082 : (The comparison type must be no wider than the IV type,
1083 : to avoid extensions in the vector loop.)
1084 :
1085 : ??? We might want to try continuing beyond Pmode for ILP32
1086 : targets if CMP_BITS < IV_PRECISION. */
1087 0 : iv_type = this_type;
1088 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1089 : cmp_type = this_type;
1090 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1091 : break;
1092 : }
1093 : }
1094 : }
1095 :
1096 23393 : if (!cmp_type)
1097 : {
1098 23393 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1099 23393 : return false;
1100 : }
1101 :
1102 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1103 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1104 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1105 0 : return true;
1106 23393 : }
1107 :
1108 : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1109 : whether we can actually generate AVX512 style masks. Return true if so,
1110 : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1111 :
1112 : static bool
1113 23393 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1114 : {
1115 : /* Produce differently organized rgc_vec and differently check
1116 : we can produce masks. */
1117 :
1118 : /* Use a normal loop if there are no statements that need masking.
1119 : This only happens in rare degenerate cases: it means that the loop
1120 : has no loads, no stores, and no live-out values. */
1121 23393 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1122 : return false;
1123 :
1124 : /* For the decrementing IV we need to represent all values in
1125 : [0, niter + niter_skip] where niter_skip is the elements we
1126 : skip in the first iteration for prologue peeling. */
1127 23393 : tree iv_type = NULL_TREE;
1128 23393 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1129 23393 : unsigned int iv_precision = UINT_MAX;
1130 23393 : if (iv_limit != -1)
1131 23393 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1132 :
1133 : /* First compute the type for the IV we use to track the remaining
1134 : scalar iterations. */
1135 23393 : opt_scalar_int_mode cmp_mode_iter;
1136 30550 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1137 : {
1138 30550 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1139 30550 : if (cmp_bits >= iv_precision
1140 30550 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1141 : {
1142 23393 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1143 23393 : if (iv_type)
1144 : break;
1145 : }
1146 : }
1147 23393 : if (!iv_type)
1148 : return false;
1149 :
1150 : /* Produce the rgroup controls. */
1151 92189 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1152 : {
1153 34398 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1154 34398 : tree vectype = mask.first;
1155 34398 : unsigned nvectors = mask.second;
1156 :
1157 : /* The number of scalars per iteration and the number of vectors are
1158 : both compile-time constants. */
1159 34398 : unsigned int nscalars_per_iter
1160 34398 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1161 34398 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1162 :
1163 : /* We index the rgroup_controls vector with nscalars_per_iter
1164 : which we keep constant and instead have a varying nvectors,
1165 : remembering the vector mask with the fewest nV. */
1166 45403 : if (masks->rgc_vec.length () < nscalars_per_iter)
1167 23447 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1168 34398 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1169 :
1170 34398 : if (!rgm->type || rgm->factor > nvectors)
1171 : {
1172 25263 : rgm->type = truth_type_for (vectype);
1173 25263 : rgm->compare_type = NULL_TREE;
1174 25263 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1175 25263 : rgm->factor = nvectors;
1176 25263 : rgm->bias_adjusted_ctrl = NULL_TREE;
1177 : }
1178 : }
1179 :
1180 : /* There is no fixed compare type we are going to use but we have to
1181 : be able to get at one for each mask group. */
1182 23393 : unsigned int min_ni_width
1183 23393 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1184 :
1185 23393 : bool ok = true;
1186 88574 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1187 : {
1188 24458 : tree mask_type = rgc.type;
1189 24458 : if (!mask_type)
1190 986 : continue;
1191 :
1192 : /* For now vect_get_loop_mask only supports integer mode masks
1193 : when we need to split it. */
1194 23472 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1195 23472 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1196 : {
1197 : ok = false;
1198 : break;
1199 : }
1200 :
1201 : /* If iv_type is usable as compare type use that - we can elide the
1202 : saturation in that case. */
1203 17413 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1204 : {
1205 17413 : tree cmp_vectype
1206 17413 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1207 17413 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1208 5930 : rgc.compare_type = cmp_vectype;
1209 : }
1210 17413 : if (!rgc.compare_type)
1211 33019 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1212 : {
1213 33015 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1214 33015 : if (cmp_bits >= min_ni_width
1215 33015 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1216 : {
1217 33003 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1218 33003 : if (!cmp_type)
1219 0 : continue;
1220 :
1221 : /* Check whether we can produce the mask with cmp_type. */
1222 33003 : tree cmp_vectype
1223 33003 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1224 33003 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1225 : {
1226 11479 : rgc.compare_type = cmp_vectype;
1227 11479 : break;
1228 : }
1229 : }
1230 : }
1231 17413 : if (!rgc.compare_type)
1232 : {
1233 : ok = false;
1234 : break;
1235 : }
1236 : }
1237 23393 : if (!ok)
1238 : {
1239 6063 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1240 6063 : return false;
1241 : }
1242 :
1243 17330 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1244 17330 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1245 17330 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1246 17330 : return true;
1247 23393 : }
1248 :
1249 : /* Check whether we can use vector access with length based on precision
1250 : comparison. So far, to keep it simple, we only allow the case that the
1251 : precision of the target supported length is larger than the precision
1252 : required by loop niters. */
1253 :
1254 : static bool
1255 6 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1256 : {
1257 6 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1258 : return false;
1259 :
1260 0 : if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
1261 : return false;
1262 :
1263 0 : machine_mode len_load_mode, len_store_mode;
1264 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1265 0 : .exists (&len_load_mode))
1266 0 : return false;
1267 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1268 0 : .exists (&len_store_mode))
1269 0 : return false;
1270 :
1271 0 : signed char partial_load_bias = internal_len_load_store_bias
1272 0 : (IFN_LEN_LOAD, len_load_mode);
1273 :
1274 0 : signed char partial_store_bias = internal_len_load_store_bias
1275 0 : (IFN_LEN_STORE, len_store_mode);
1276 :
1277 0 : gcc_assert (partial_load_bias == partial_store_bias);
1278 :
1279 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1280 : return false;
1281 :
1282 : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1283 : len_loads with a length of zero. In order to avoid that we prohibit
1284 : more than one loop length here. */
1285 0 : if (partial_load_bias == -1
1286 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1287 : return false;
1288 :
1289 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1290 :
1291 0 : unsigned int max_nitems_per_iter = 1;
1292 0 : unsigned int i;
1293 0 : rgroup_controls *rgl;
1294 : /* Find the maximum number of items per iteration for every rgroup. */
1295 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1296 : {
1297 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1298 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1299 : }
1300 :
1301 : /* Work out how many bits we need to represent the length limit. */
1302 0 : unsigned int min_ni_prec
1303 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1304 :
1305 : /* Now use the maximum of below precisions for one suitable IV type:
1306 : - the IV's natural precision
1307 : - the precision needed to hold: the maximum number of scalar
1308 : iterations multiplied by the scale factor (min_ni_prec above)
1309 : - the Pmode precision
1310 :
1311 : If min_ni_prec is less than the precision of the current niters,
1312 : we prefer to still use the niters type. Prefer to use Pmode and
1313 : wider IV to avoid narrow conversions. */
1314 :
1315 0 : unsigned int ni_prec
1316 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1317 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1318 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1319 :
1320 0 : tree iv_type = NULL_TREE;
1321 0 : opt_scalar_int_mode tmode_iter;
1322 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1323 : {
1324 0 : scalar_mode tmode = tmode_iter.require ();
1325 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1326 :
1327 : /* ??? Do we really want to construct one IV whose precision exceeds
1328 : BITS_PER_WORD? */
1329 0 : if (tbits > BITS_PER_WORD)
1330 : break;
1331 :
1332 : /* Find the first available standard integral type. */
1333 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1334 : {
1335 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1336 0 : break;
1337 : }
1338 : }
1339 :
1340 0 : if (!iv_type)
1341 : {
1342 0 : if (dump_enabled_p ())
1343 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1344 : "can't vectorize with length-based partial vectors"
1345 : " because there is no suitable iv type.\n");
1346 0 : return false;
1347 : }
1348 :
1349 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1350 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1351 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1352 :
1353 0 : return true;
1354 : }
1355 :
1356 : /* Calculate the cost of one scalar iteration of the loop. */
1357 : static void
1358 363616 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1359 : {
1360 363616 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1361 363616 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1362 363616 : int nbbs = loop->num_nodes, factor;
1363 363616 : int innerloop_iters, i;
1364 :
1365 363616 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1366 :
1367 : /* Gather costs for statements in the scalar loop. */
1368 :
1369 : /* FORNOW. */
1370 363616 : innerloop_iters = 1;
1371 363616 : if (loop->inner)
1372 1626 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1373 :
1374 1249696 : for (i = 0; i < nbbs; i++)
1375 : {
1376 886080 : gimple_stmt_iterator si;
1377 886080 : basic_block bb = bbs[i];
1378 :
1379 886080 : if (bb->loop_father == loop->inner)
1380 : factor = innerloop_iters;
1381 : else
1382 882828 : factor = 1;
1383 :
1384 7284422 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1385 : {
1386 5512262 : gimple *stmt = gsi_stmt (si);
1387 5512262 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1388 :
1389 5512262 : if (!is_gimple_assign (stmt)
1390 : && !is_gimple_call (stmt)
1391 : && !is_a<gcond *> (stmt))
1392 1975585 : continue;
1393 :
1394 : /* Skip stmts that are not vectorized inside the loop. */
1395 3536677 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1396 3536677 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1397 1741447 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1398 53 : || !VECTORIZABLE_CYCLE_DEF
1399 : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1400 1741447 : continue;
1401 :
1402 1795230 : vect_cost_for_stmt kind;
1403 1795230 : if (STMT_VINFO_DATA_REF (stmt_info))
1404 : {
1405 863942 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1406 : kind = scalar_load;
1407 : else
1408 323206 : kind = scalar_store;
1409 : }
1410 931288 : else if (vect_nop_conversion_p (stmt_info))
1411 53761 : continue;
1412 : else
1413 : kind = scalar_stmt;
1414 :
1415 : /* We are using vect_prologue here to avoid scaling twice
1416 : by the inner loop factor. */
1417 1741469 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1418 : factor, kind, stmt_info, 0, vect_body);
1419 : }
1420 : }
1421 :
1422 : /* Now accumulate cost. */
1423 363616 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1424 363616 : add_stmt_costs (loop_vinfo->scalar_costs,
1425 : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1426 363616 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1427 363616 : }
1428 :
1429 : /* Function vect_analyze_loop_form.
1430 :
1431 : Verify that certain CFG restrictions hold, including:
1432 : - the loop has a pre-header
1433 : - the loop has a single entry
1434 : - nested loops can have only a single exit.
1435 : - the loop exit condition is simple enough
1436 : - the number of iterations can be analyzed, i.e, a countable loop. The
1437 : niter could be analyzed under some assumptions. */
1438 :
1439 : opt_result
1440 454156 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1441 : vect_loop_form_info *info)
1442 : {
1443 454156 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1444 :
1445 454156 : edge exit_e = vec_init_loop_exit_info (loop);
1446 454156 : if (!exit_e)
1447 29182 : return opt_result::failure_at (vect_location,
1448 : "not vectorized:"
1449 : " Infinite loop detected.\n");
1450 424974 : if (loop_vectorized_call)
1451 : {
1452 28761 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1453 28761 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1454 28761 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1455 28761 : if (!scalar_exit_e)
1456 0 : return opt_result::failure_at (vect_location,
1457 : "not vectorized:"
1458 : " could not determine main exit from"
1459 : " loop with multiple exits.\n");
1460 : }
1461 :
1462 424974 : info->loop_exit = exit_e;
1463 424974 : if (dump_enabled_p ())
1464 16041 : dump_printf_loc (MSG_NOTE, vect_location,
1465 : "using as main loop exit: %d -> %d [AUX: %p]\n",
1466 16041 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1467 :
1468 : /* Check if we have any control flow that doesn't leave the loop. */
1469 424974 : basic_block *bbs = get_loop_body (loop);
1470 1390602 : for (unsigned i = 0; i < loop->num_nodes; i++)
1471 1081009 : if (EDGE_COUNT (bbs[i]->succs) != 1
1472 1081009 : && (EDGE_COUNT (bbs[i]->succs) != 2
1473 646677 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1474 : {
1475 115381 : free (bbs);
1476 115381 : return opt_result::failure_at (vect_location,
1477 : "not vectorized:"
1478 : " unsupported control flow in loop.\n");
1479 : }
1480 :
1481 : /* Check if we have any control flow that doesn't leave the loop. */
1482 310676 : bool has_phi = false;
1483 310676 : for (unsigned i = 0; i < loop->num_nodes; i++)
1484 310225 : if (!gimple_seq_empty_p (phi_nodes (bbs[i])))
1485 : {
1486 : has_phi = true;
1487 : break;
1488 : }
1489 309593 : if (!has_phi)
1490 451 : return opt_result::failure_at (vect_location,
1491 : "not vectorized:"
1492 : " no scalar evolution detected in loop.\n");
1493 :
1494 309142 : free (bbs);
1495 :
1496 : /* Different restrictions apply when we are considering an inner-most loop,
1497 : vs. an outer (nested) loop.
1498 : (FORNOW. May want to relax some of these restrictions in the future). */
1499 :
1500 309142 : info->inner_loop_cond = NULL;
1501 309142 : if (!loop->inner)
1502 : {
1503 : /* Inner-most loop. */
1504 :
1505 290604 : if (empty_block_p (loop->header))
1506 0 : return opt_result::failure_at (vect_location,
1507 : "not vectorized: empty loop.\n");
1508 : }
1509 : else
1510 : {
1511 18538 : class loop *innerloop = loop->inner;
1512 18538 : edge entryedge;
1513 :
1514 : /* Nested loop. We currently require that the loop is doubly-nested,
1515 : contains a single inner loop with a single exit to the block
1516 : with the single exit condition in the outer loop.
1517 : Vectorizable outer-loops look like this:
1518 :
1519 : (pre-header)
1520 : |
1521 : header <---+
1522 : | |
1523 : inner-loop |
1524 : | |
1525 : tail ------+
1526 : |
1527 : (exit-bb)
1528 :
1529 : The inner-loop also has the properties expected of inner-most loops
1530 : as described above. */
1531 :
1532 18538 : if ((loop->inner)->inner || (loop->inner)->next)
1533 2968 : return opt_result::failure_at (vect_location,
1534 : "not vectorized:"
1535 : " multiple nested loops.\n");
1536 :
1537 15570 : entryedge = loop_preheader_edge (innerloop);
1538 15570 : if (entryedge->src != loop->header
1539 15052 : || !single_exit (innerloop)
1540 26989 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1541 4445 : return opt_result::failure_at (vect_location,
1542 : "not vectorized:"
1543 : " unsupported outerloop form.\n");
1544 :
1545 : /* Analyze the inner-loop. */
1546 11125 : vect_loop_form_info inner;
1547 11125 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1548 11125 : if (!res)
1549 : {
1550 416 : if (dump_enabled_p ())
1551 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1552 : "not vectorized: Bad inner loop.\n");
1553 416 : return res;
1554 : }
1555 :
1556 : /* Don't support analyzing niter under assumptions for inner
1557 : loop. */
1558 10709 : if (!integer_onep (inner.assumptions))
1559 257 : return opt_result::failure_at (vect_location,
1560 : "not vectorized: Bad inner loop.\n");
1561 :
1562 10452 : if (inner.number_of_iterations == chrec_dont_know
1563 10452 : || !expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1564 1837 : return opt_result::failure_at (vect_location,
1565 : "not vectorized: inner-loop count not"
1566 : " invariant.\n");
1567 :
1568 8615 : if (dump_enabled_p ())
1569 1050 : dump_printf_loc (MSG_NOTE, vect_location,
1570 : "Considering outer-loop vectorization.\n");
1571 8615 : info->inner_loop_cond = inner.conds[0];
1572 11125 : }
1573 :
1574 299219 : if (EDGE_COUNT (loop->header->preds) != 2)
1575 0 : return opt_result::failure_at (vect_location,
1576 : "not vectorized:"
1577 : " too many incoming edges.\n");
1578 :
1579 : /* We assume that the latch is empty. */
1580 299219 : basic_block latch = loop->latch;
1581 299219 : do
1582 : {
1583 299219 : if (!empty_block_p (latch)
1584 299219 : || !gimple_seq_empty_p (phi_nodes (latch)))
1585 20671 : return opt_result::failure_at (vect_location,
1586 : "not vectorized: latch block not "
1587 : "empty.\n");
1588 278548 : latch = single_pred (latch);
1589 : }
1590 557096 : while (single_succ_p (latch));
1591 :
1592 : /* Make sure there is no abnormal exit. */
1593 278548 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1594 956164 : for (edge e : exits)
1595 : {
1596 399101 : if (e->flags & EDGE_ABNORMAL)
1597 33 : return opt_result::failure_at (vect_location,
1598 : "not vectorized:"
1599 : " abnormal loop exit edge.\n");
1600 : }
1601 :
1602 278515 : info->conds
1603 278515 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1604 : &info->number_of_iterations,
1605 278515 : &info->number_of_iterationsm1);
1606 278515 : if (info->conds.is_empty ())
1607 33 : return opt_result::failure_at
1608 33 : (vect_location,
1609 : "not vectorized: complicated exit condition.\n");
1610 :
1611 : /* Determine what the primary and alternate exit conds are. */
1612 677517 : for (unsigned i = 0; i < info->conds.length (); i++)
1613 : {
1614 399035 : gcond *cond = info->conds[i];
1615 399035 : if (exit_e->src == gimple_bb (cond))
1616 278482 : std::swap (info->conds[0], info->conds[i]);
1617 : }
1618 :
1619 278482 : if (chrec_contains_undetermined (info->number_of_iterations))
1620 : {
1621 58214 : if (dump_enabled_p ())
1622 259 : dump_printf_loc (MSG_NOTE, vect_location,
1623 : "Loop being analyzed as uncounted.\n");
1624 58214 : if (loop->inner)
1625 562 : return opt_result::failure_at
1626 562 : (vect_location,
1627 : "not vectorized: outer loop vectorization of uncounted loops"
1628 : " is unsupported.\n");
1629 57652 : return opt_result::success ();
1630 : }
1631 :
1632 220268 : if (integer_zerop (info->assumptions))
1633 4 : return opt_result::failure_at
1634 4 : (info->conds[0],
1635 : "not vectorized: number of iterations cannot be computed.\n");
1636 :
1637 220264 : if (integer_zerop (info->number_of_iterations))
1638 12 : return opt_result::failure_at
1639 12 : (info->conds[0],
1640 : "not vectorized: number of iterations = 0.\n");
1641 :
1642 220252 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1643 121238 : && tree_to_shwi (info->number_of_iterations) > 0))
1644 : {
1645 99014 : if (dump_enabled_p ())
1646 : {
1647 2481 : dump_printf_loc (MSG_NOTE, vect_location,
1648 : "Symbolic number of iterations is ");
1649 2481 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1650 2481 : dump_printf (MSG_NOTE, "\n");
1651 : }
1652 : }
1653 :
1654 220252 : if (!integer_onep (info->assumptions))
1655 : {
1656 8623 : if (dump_enabled_p ())
1657 : {
1658 68 : dump_printf_loc (MSG_NOTE, vect_location,
1659 : "Loop to be versioned with niter assumption ");
1660 68 : dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
1661 68 : dump_printf (MSG_NOTE, "\n");
1662 : }
1663 : }
1664 :
1665 220252 : return opt_result::success ();
1666 278548 : }
1667 :
1668 : /* Create a loop_vec_info for LOOP with SHARED and the
1669 : vect_analyze_loop_form result INFO.

   The iteration counts, the conditions and the main exit recorded in INFO
   are copied into the new loop_vec_info.  ORIG_LOOP_INFO may be NULL; it
   is stored as the original loop info, and when it is itself an epilogue
   its main loop info is inherited, otherwise ORIG_LOOP_INFO is also used
   as the main loop info.  The niter assumptions from INFO are only
   recorded when ORIG_LOOP_INFO is NULL and the assumptions are not the
   constant one.  Returns the newly created loop_vec_info.  */
1670 :
1671 : loop_vec_info
1672 575326 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1673 : const vect_loop_form_info *info,
1674 : loop_vec_info orig_loop_info)
1675 : {
1676 575326 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1677 575326 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1678 575326 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1679 575326 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1680 575326 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1681 575326 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1682 344 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1683 344 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1684 : else
1685 574982 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1686 : /* Also record the assumptions for versioning. */
1687 575326 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1688 19517 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1689 :
1690 2552075 : for (gcond *cond : info->conds)
1691 : {
1692 826097 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1693 : /* Mark the statement as a condition. */
1694 826097 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1695 : }
1696 :
   /* Unless the niters are uncounted, the first condition in INFO is
      recorded as the loop IV condition.  Every remaining condition (all of
      them in the uncounted case) is pushed onto the list of loop
      conditions.  */
1697 575326 : unsigned cond_id = 0;
1698 575326 : if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
1699 491225 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[cond_id++];
1700 :
1701 910198 : for (; cond_id < info->conds.length (); cond_id ++)
1702 334872 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[cond_id]);
1703 :
1704 575326 : LOOP_VINFO_MAIN_EXIT (loop_vinfo) = info->loop_exit;
1705 :
1706 : /* Check to see if we're vectorizing multiple exits. */
1707 575326 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1708 575326 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1709 :
1710 : /* At the moment we can't support no epilogs for multiple exits: the
1711 : result of the first compare should be masked by that of the second.
1712 : We can only allow it if the early exits have the same live values. For
1713 : differing values we have to calculate a third mask to disambiguate.
      Hence an epilog is required as soon as more than one condition was
      pushed onto the list of loop conditions. */
1714 575326 : LOOP_VINFO_EARLY_BRK_NEEDS_EPILOG (loop_vinfo)
1715 575326 : = LOOP_VINFO_LOOP_CONDS (loop_vinfo).length () > 1;
1716 :
1717 575326 : if (info->inner_loop_cond)
1718 : {
1719 : /* If we have an estimate on the number of iterations of the inner
1720 : loop use that to limit the scale for costing, otherwise use
1721 : --param vect-inner-loop-cost-factor literally. */
1722 9042 : widest_int nit;
1723 9042 : if (estimated_stmt_executions (loop->inner, &nit))
1724 7735 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1725 7735 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1726 9042 : }
1727 :
1728 575326 : return loop_vinfo;
1729 : }
1730 :
1731 :
1732 :
1733 : /* Return true if we know that the iteration count is smaller than the
1734 : vectorization factor. Return false if it isn't, or if we can't be sure
1735 : either way.

   The vectorization factor used is vect_vf_for_cost (LOOP_VINFO).  The
   iteration count is LOOP_VINFO_INT_NITERS when the number of iterations
   is known, otherwise max_stmt_executions_int of the loop; a value of -1
   is never considered smaller. */
1736 :
1737 : static bool
1738 154345 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1739 : {
1740 154345 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1741 :
1742 154345 : HOST_WIDE_INT max_niter;
1743 154345 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1744 79233 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1745 : else
1746 75112 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1747 :
1748 154345 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1749 10936 : return true;
1750 :
1751 : return false;
1752 : }
1753 :
1754 : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1755 : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1756 : definitely no, or -1 if it's worth retrying.

   SUGGESTED_UNROLL_FACTOR is forwarded to
   vect_estimate_min_profitable_iters.  When the analysis gets past that
   call with a non-negative minimum, LOOP_VINFO_COST_MODEL_THRESHOLD is set
   to the larger of that minimum and param_min_vect_loop_bound times the
   assumed VF.

   -1 is returned when the vector version is found to never be profitable,
   when the estimated iteration count is below the larger of the threshold
   and the static profitability estimate, or when an uncounted loop has no
   iteration estimate and is not trivially profitable.  Every other
   rejection returns 0. */
1757 :
1758 : static int
1759 154354 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1760 : unsigned *suggested_unroll_factor)
1761 : {
1762 154354 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1763 154354 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1764 :
1765 : /* Only loops that can handle partially-populated vectors can have iteration
1766 : counts less than the vectorization factor. */
1767 154354 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
1768 154354 : && vect_known_niters_smaller_than_vf (loop_vinfo))
1769 : {
1770 10926 : if (dump_enabled_p ())
1771 236 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1772 : "not vectorized: iteration count smaller than "
1773 : "vectorization factor.\n");
1774 10926 : return 0;
1775 : }
1776 :
1777 : /* If we know the number of iterations we can do better, for the
1778 : epilogue we can also decide whether the main loop leaves us
1779 : with enough iterations, preferring a smaller vector epilog then
1780 : also possibly used for the case we skip the vector loop. */
1781 143428 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1782 : {
1783 69582 : widest_int scalar_niters
1784 69582 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
1785 69582 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1786 : {
1787 2639 : loop_vec_info orig_loop_vinfo
1788 : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1789 2639 : loop_vec_info main_loop_vinfo
1790 : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
1791 2639 : unsigned lowest_vf
1792 2639 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
1793 2639 : int prolog_peeling = 0;
1794 2639 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
1795 2639 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
      /* Only when the prologue peeling amount is non-negative and the
	 VF of the original loop equals its constant lower bound,
	 reduce SCALAR_NITERS to what is left over after the peeled
	 iterations and whole multiples of that VF. */
1796 2639 : if (prolog_peeling >= 0
1797 2639 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
1798 : lowest_vf))
1799 : {
1800 5268 : unsigned gap
1801 2634 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
1802 5268 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
1803 5268 : % lowest_vf + gap);
1804 : }
1805 : }
1806 : /* Reject vectorizing for a single scalar iteration, even if
1807 : we could in principle implement that using partial vectors.
1808 : But allow such vectorization if VF == 1 in case we do not
1809 : need to peel for gaps (if we need, avoid vectorization for
1810 : reasons of code footprint). */
1811 69582 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1812 69582 : if (scalar_niters <= peeling_gap + 1
1813 69582 : && (assumed_vf > 1 || peeling_gap != 0))
1814 : {
1815 655 : if (dump_enabled_p ())
1816 159 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1817 : "not vectorized: loop only has a single "
1818 : "scalar iteration.\n");
1819 655 : return 0;
1820 : }
1821 :
1822 68927 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1823 : {
1824 : /* Check that the loop processes at least one full vector. */
1825 68916 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1826 68916 : if (known_lt (scalar_niters, vf))
1827 : {
1828 348 : if (dump_enabled_p ())
1829 296 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1830 : "loop does not have enough iterations "
1831 : "to support vectorization.\n");
1832 388 : return 0;
1833 : }
1834 :
1835 : /* If we need to peel an extra epilogue iteration to handle data
1836 : accesses with gaps, check that there are enough scalar iterations
1837 : available.
1838 :
1839 : The check above is redundant with this one when peeling for gaps,
1840 : but the distinction is useful for diagnostics. */
1841 68568 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1842 68874 : && known_le (scalar_niters, vf))
1843 : {
1844 40 : if (dump_enabled_p ())
1845 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1846 : "loop does not have enough iterations "
1847 : "to support peeling for gaps.\n");
1848 40 : return 0;
1849 : }
1850 : }
1851 69582 : }
1852 :
1853 : /* If using the "very cheap" model, reject cases in which we'd keep
1854 : a copy of the scalar code (even if we might be able to vectorize it). */
1855 142385 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1856 142385 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1857 75438 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
1858 : {
1859 721 : if (dump_enabled_p ())
1860 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 : "some scalar iterations would need to be peeled\n");
1862 721 : return 0;
1863 : }
1864 :
   /* Compute MIN_PROFITABLE_ITERS and MIN_PROFITABLE_ESTIMATE.  A negative
      MIN_PROFITABLE_ITERS is reported below as "never profitable". */
1865 141664 : int min_profitable_iters, min_profitable_estimate;
1866 141664 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1867 : &min_profitable_estimate,
1868 : suggested_unroll_factor);
1869 :
1870 141664 : if (min_profitable_iters < 0)
1871 : {
1872 23820 : if (dump_enabled_p ())
1873 30 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 : "not vectorized: vectorization not profitable.\n");
1875 23820 : if (dump_enabled_p ())
1876 30 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1877 : "not vectorized: vector version will never be "
1878 : "profitable.\n");
1879 23820 : return -1;
1880 : }
1881 :
1882 117844 : int min_scalar_loop_bound = (param_min_vect_loop_bound
1883 117844 : * assumed_vf);
1884 :
1885 : /* Use the cost model only if it is more conservative than user specified
1886 : threshold. */
1887 117844 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1888 : min_profitable_iters);
1889 :
1890 117844 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1891 :
1892 63442 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1893 181286 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1894 : {
1895 436 : if (dump_enabled_p ())
1896 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1897 : "not vectorized: vectorization not profitable.\n");
1898 436 : if (dump_enabled_p ())
1899 1 : dump_printf_loc (MSG_NOTE, vect_location,
1900 : "not vectorized: iteration count smaller than user "
1901 : "specified loop bound parameter or minimum profitable "
1902 : "iterations (whichever is more conservative).\n");
1903 436 : return 0;
1904 : }
1905 :
1906 : /* The static profitability threshold min_profitable_estimate includes
1907 : the cost of having to check at runtime whether the scalar loop
1908 : should be used instead. If it turns out that we don't need or want
1909 : such a check, the threshold we should use for the static estimate
1910 : is simply the point at which the vector loop becomes more profitable
1911 : than the scalar loop. */
1912 117408 : if (min_profitable_estimate > min_profitable_iters
1913 25133 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1914 24578 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1915 613 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1916 118021 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1917 : {
1918 12 : if (dump_enabled_p ())
1919 7 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1920 : " choice between the scalar and vector loops\n");
1921 12 : min_profitable_estimate = min_profitable_iters;
1922 : }
1923 :
1924 : /* If the vector loop needs multiple iterations to be beneficial then
1925 : things are probably too close to call, and the conservative thing
1926 : would be to stick with the scalar code. */
1927 117408 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1928 117408 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1929 : {
1930 18319 : if (dump_enabled_p ())
1931 223 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1932 : "one iteration of the vector loop would be"
1933 : " more expensive than the equivalent number of"
1934 : " iterations of the scalar loop\n");
1935 18319 : return 0;
1936 : }
1937 :
1938 99089 : HOST_WIDE_INT estimated_niter;
1939 :
1940 : /* If we are vectorizing an epilogue then we know the maximum number of
1941 : scalar iterations it will cover is at least one lower than the
1942 : vectorization factor of the main loop. */
1943 99089 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1944 12044 : estimated_niter
1945 12044 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1946 : else
1947 : {
   /* Fall back to the likely maximum when no estimate is available
      (signalled by -1). */
1948 87045 : estimated_niter = estimated_stmt_executions_int (loop);
1949 87045 : if (estimated_niter == -1)
1950 31573 : estimated_niter = likely_max_stmt_executions_int (loop);
1951 : }
1952 43617 : if (estimated_niter != -1
1953 96140 : && ((unsigned HOST_WIDE_INT) estimated_niter
1954 96140 : < MAX (th, (unsigned) min_profitable_estimate)))
1955 : {
1956 4297 : if (dump_enabled_p ())
1957 32 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1958 : "not vectorized: estimated iteration count too "
1959 : "small.\n");
1960 4297 : if (dump_enabled_p ())
1961 32 : dump_printf_loc (MSG_NOTE, vect_location,
1962 : "not vectorized: estimated iteration count smaller "
1963 : "than specified loop bound parameter or minimum "
1964 : "profitable iterations (whichever is more "
1965 : "conservative).\n");
1966 4297 : return -1;
1967 : }
1968 :
1969 : /* As we cannot use a runtime check to gate profitability for uncounted
1970 : loops require either an estimate or if none, at least a profitable
1971 : vectorization within the first vector iteration (that condition
1972 : will practically never be true due to the required epilog and
1973 : likely alignment prologue). */
1974 94792 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)
1975 163 : && estimated_niter == -1
1976 94928 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1977 : {
1978 120 : if (dump_enabled_p ())
1979 2 : dump_printf_loc (MSG_NOTE, vect_location,
1980 : "not vectorized: no loop iteration estimate on the "
1981 : "uncounted loop and not trivially profitable.\n");
1982 120 : return -1;
1983 : }
1984 :
1985 : return 1;
1986 : }
1987 :
1988 : /* Gather data references in LOOP with body BBS and store them into
1989 : *DATAREFS.

   Debug statements are skipped.  For any other statement the result of
   vect_find_stmt_data_reference is returned as soon as it fails, except
   that in a loop with a nonzero safelen a call to a function that has
   simd clones is ignored when neither its arguments nor its lhs are data
   references.  A failure is also returned once *DATAREFS holds more than
   param_loop_max_datarefs_for_datadeps entries.  Returns success
   otherwise. */
1990 :
1991 : static opt_result
1992 276153 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1993 : vec<data_reference_p> *datarefs)
1994 : {
1995 826477 : for (unsigned i = 0; i < loop->num_nodes; i++)
1996 1224034 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1997 5285394 : !gsi_end_p (gsi); gsi_next (&gsi))
1998 : {
1999 4735070 : gimple *stmt = gsi_stmt (gsi);
2000 4735070 : if (is_gimple_debug (stmt))
2001 2234013 : continue;
2002 2501187 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2003 : NULL, 0);
2004 2501187 : if (!res)
2005 : {
2006 61823 : if (is_gimple_call (stmt) && loop->safelen)
2007 : {
2008 402 : tree fndecl = gimple_call_fndecl (stmt), op;
   /* For an IFN_MASK_CALL without a fndecl, the called function is
      taken from the address in the first call argument. */
2009 402 : if (fndecl == NULL_TREE
2010 402 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2011 : {
2012 0 : fndecl = gimple_call_arg (stmt, 0);
2013 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2014 0 : fndecl = TREE_OPERAND (fndecl, 0);
2015 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2016 : }
2017 402 : if (fndecl != NULL_TREE)
2018 : {
2019 365 : cgraph_node *node = cgraph_node::get (fndecl);
2020 365 : if (node != NULL && node->simd_clones != NULL)
2021 : {
   /* Stop at the first argument that is a decl or a reference with
      a base address; J == N afterwards means no such argument. */
2022 131 : unsigned int j, n = gimple_call_num_args (stmt);
2023 545 : for (j = 0; j < n; j++)
2024 : {
2025 284 : op = gimple_call_arg (stmt, j);
2026 284 : if (DECL_P (op)
2027 284 : || (REFERENCE_CLASS_P (op)
2028 0 : && get_base_address (op)))
2029 : break;
2030 : }
2031 131 : op = gimple_call_lhs (stmt);
2032 : /* Ignore #pragma omp declare simd functions
2033 : if they don't have data references in the
2034 : call stmt itself. */
2035 261 : if (j == n
2036 131 : && !(op
2037 120 : && (DECL_P (op)
2038 120 : || (REFERENCE_CLASS_P (op)
2039 0 : && get_base_address (op)))))
2040 130 : continue;
2041 : }
2042 : }
2043 : }
2044 61693 : return res;
2045 : }
2046 : /* If dependence analysis will give up due to the limit on the
2047 : number of datarefs stop here and fail fatally. */
2048 4278784 : if (datarefs->length ()
2049 1839420 : > (unsigned)param_loop_max_datarefs_for_datadeps)
2050 0 : return opt_result::failure_at (stmt, "exceeded param "
2051 : "loop-max-datarefs-for-datadeps\n");
2052 : }
2053 214460 : return opt_result::success ();
2054 : }
2055 :
2056 : /* Determine if operating on full vectors for LOOP_VINFO might leave
2057 : some scalar iterations still to do. If so, decide how we should
2058 : handle those scalar iterations. The possibilities are:
2059 :
2060 : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2061 : In this case:
2062 :
2063 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2064 : LOOP_VINFO_PEELING_FOR_NITER == false
2065 :
2066 : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2067 : to handle the remaining scalar iterations. In this case:
2068 :
2069 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2070 : LOOP_VINFO_PEELING_FOR_NITER == true
2071 :
2072 : The MASKED_P argument specifies to what extent
2073 : param_vect_partial_vector_usage is to be honored. For MASKED_P == 0
2074 : no partial vectors are to be used, for MASKED_P == -1 it's
2075 : param_vect_partial_vector_usage that gets to decide whether we may
2076 : consider partial vector usage. For MASKED_P == 1 partial vectors
2077 : may be used if possible.
2078 :
   Returns a failure when LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P is set but
   partial vectors end up not being used, and success otherwise.

2079 : */
2080 :
2081 : static opt_result
2082 155269 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2083 : int masked_p)
2084 : {
2085 : /* Determine whether there would be any scalar iterations left over. */
2086 155269 : bool need_peeling_or_partial_vectors_p
2087 155269 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2088 :
2089 : /* Decide whether to vectorize the loop with partial vectors. */
2090 155269 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2091 155269 : if (masked_p == 0
2092 155269 : || (masked_p == -1 && param_vect_partial_vector_usage == 0))
2093 : /* If requested explicitly do not use partial vectors. */
2094 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2095 207 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2096 65 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2097 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2098 207 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2099 65 : && need_peeling_or_partial_vectors_p)
2100 : {
2101 : /* For partial-vector-usage=1, try to push the handling of partial
2102 : vectors to the epilogue, with the main loop continuing to operate
2103 : on full vectors.
2104 :
2105 : If we are unrolling we also do not want to use partial vectors. This
2106 : is to avoid the overhead of generating multiple masks and also to
2107 : avoid having to execute entire iterations of FALSE masked instructions
2108 : when dealing with one or less full iterations.
2109 :
2110 : ??? We could then end up failing to use partial vectors if we
2111 : decide to peel iterations into a prologue, and if the main loop
2112 : then ends up processing fewer than VF iterations. */
2113 43 : if ((param_vect_partial_vector_usage == 1
2114 11 : || loop_vinfo->suggested_unroll_factor > 1)
2115 32 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2116 65 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2117 : ;
2118 : else
2119 31 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2120 : }
2121 :
2122 155269 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2123 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2124 0 : return opt_result::failure_at (vect_location,
2125 : "not vectorized: loop needs but cannot "
2126 : "use partial vectors\n");
2127 :
2128 155269 : if (dump_enabled_p ())
2129 12551 : dump_printf_loc (MSG_NOTE, vect_location,
2130 : "operating on %s vectors%s.\n",
2131 12551 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2132 : ? "partial" : "full",
2133 12551 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2134 : ? " for epilogue loop" : "");
2135 :
   /* Peeling for niters is requested exactly when scalar iterations would
      be left over and partial vectors are not used to handle them. */
2136 155269 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2137 310538 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2138 155269 : && need_peeling_or_partial_vectors_p);
2139 :
2140 155269 : return opt_result::success ();
2141 : }
2142 :
2143 : /* Function vect_analyze_loop_2.
2144 :
2145 : Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2146 : analyses will record information in some members of LOOP_VINFO. FATAL
2147 : indicates whether some analysis met a fatal error. If a non-NULL pointer
2148 : SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with a
2149 : worked out suggested unroll factor, while a NULL pointer shows that the
2150 : suggested unroll factor is going to be applied.
2151 : SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF is to hold whether single-lane
2152 : slp was forced when the suggested unroll factor was worked out. */
2153 : static opt_result
2154 574626 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, int masked_p, bool &fatal,
2155 : unsigned *suggested_unroll_factor,
2156 : bool& single_lane_slp_done_for_suggested_uf)
2157 : {
2158 574626 : opt_result ok = opt_result::success ();
2159 574626 : int res;
2160 574626 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2161 574626 : loop_vec_info orig_loop_vinfo = NULL;
2162 :
2163 : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2164 : loop_vec_info of the first vectorized loop. */
2165 574626 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2166 13842 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2167 : else
2168 : orig_loop_vinfo = loop_vinfo;
2169 13842 : gcc_assert (orig_loop_vinfo);
2170 :
2171 : /* We can't mask on niters for uncounted loops due to unknown upper bound. */
2172 574626 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
2173 84101 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2174 :
2175 : /* The first group of checks is independent of the vector size. */
2176 574626 : fatal = true;
2177 :
2178 574626 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2179 574626 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2180 5 : return opt_result::failure_at (vect_location,
2181 : "not vectorized: simd if(0)\n");
2182 :
2183 : /* Find all data references in the loop (which correspond to vdefs/vuses)
2184 : and analyze their evolution in the loop. */
2185 :
2186 574621 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2187 :
2188 : /* Gather the data references. */
2189 574621 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2190 : {
2191 276153 : opt_result res
2192 276153 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2193 : &LOOP_VINFO_DATAREFS (loop_vinfo));
2194 276153 : if (!res)
2195 : {
2196 61693 : if (dump_enabled_p ())
2197 1630 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2198 : "not vectorized: loop contains function "
2199 : "calls or data references that cannot "
2200 : "be analyzed\n");
2201 61693 : return res;
2202 : }
2203 214460 : loop_vinfo->shared->save_datarefs ();
2204 : }
2205 : else
2206 298468 : loop_vinfo->shared->check_datarefs ();
2207 :
2208 : /* Analyze the data references and also adjust the minimal
2209 : vectorization factor according to the loads and stores. */
2210 :
2211 512928 : ok = vect_analyze_data_refs (loop_vinfo, &fatal);
2212 512928 : if (!ok)
2213 : {
2214 72476 : if (dump_enabled_p ())
2215 1231 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2216 : "bad data references.\n");
2217 72476 : return ok;
2218 : }
2219 :
2220 : /* Check if we are applying unroll factor now. */
2221 440452 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2222 440452 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2223 :
2224 : /* When single-lane SLP was forced and we are applying suggested unroll
2225 : factor, keep that decision here. */
2226 880904 : bool force_single_lane = (applying_suggested_uf
2227 440452 : && single_lane_slp_done_for_suggested_uf);
2228 :
2229 : /* Classify all cross-iteration scalar data-flow cycles.
2230 : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2231 440452 : vect_analyze_scalar_cycles (loop_vinfo);
2232 :
2233 440452 : vect_pattern_recog (loop_vinfo);
2234 :
2235 : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2236 : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2237 :
2238 440452 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2239 440452 : if (!ok)
2240 : {
2241 8010 : if (dump_enabled_p ())
2242 292 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243 : "bad data access.\n");
2244 8010 : return ok;
2245 : }
2246 :
2247 : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2248 :
2249 432442 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2250 432442 : if (!ok)
2251 : {
2252 45036 : if (dump_enabled_p ())
2253 398 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2254 : "unexpected pattern.\n");
2255 45036 : return ok;
2256 : }
2257 :
2258 : /* While the rest of the analysis below depends on it in some way. */
2259 387406 : fatal = false;
2260 :
2261 : /* Analyze data dependences between the data-refs in the loop
2262 : and adjust the maximum vectorization factor according to
2263 : the dependences.
2264 : FORNOW: fail at the first data dependence that we encounter. */
2265 :
2266 387406 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2267 387406 : if (!ok)
2268 : {
2269 23790 : if (dump_enabled_p ())
2270 538 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 : "bad data dependence.\n");
2272 23790 : return ok;
2273 : }
2274 363616 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2275 :
2276 : /* Compute the scalar iteration cost. */
2277 363616 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2278 :
2279 363616 : bool saved_can_use_partial_vectors_p
2280 : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2281 :
2282 : /* This is the point where we can re-start analysis with single-lane
2283 : SLP forced. */
2284 498511 : start_over:
2285 :
2286 : /* Check the SLP opportunities in the loop, analyze and build
2287 : SLP trees. */
2288 997022 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2289 : force_single_lane);
2290 498511 : if (!ok)
2291 24865 : return ok;
2292 :
2293 : /* If there are any SLP instances mark them as pure_slp and compute
2294 : the overall vectorization factor. */
2295 473646 : if (!vect_make_slp_decision (loop_vinfo))
2296 61279 : return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
2297 :
2298 412367 : if (dump_enabled_p ())
2299 19116 : dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
2300 :
2301 : /* Dump the vectorization factor from the SLP decision. */
2302 412367 : if (dump_enabled_p ())
2303 : {
2304 19116 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
2305 19116 : dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2306 19116 : dump_printf (MSG_NOTE, "\n");
2307 : }
2308 :
2309 : /* We don't expect to have to roll back to anything other than an empty
2310 : set of rgroups. */
2311 412367 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2312 :
2313 : /* Apply the suggested unrolling factor, this was determined by the backend
2314 : during finish_cost the first time we ran the analysis for this
2315 : vector mode. */
2316 412367 : if (applying_suggested_uf)
2317 456 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2318 :
2319 : /* Now the vectorization factor is final. */
2320 412367 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2321 412367 : gcc_assert (known_ne (vectorization_factor, 0U));
2322 :
2323 : /* Optimize the SLP graph with the vectorization factor fixed. */
2324 412367 : vect_optimize_slp (loop_vinfo);
2325 :
2326 : /* Gather the loads reachable from the SLP graph entries. */
2327 412367 : vect_gather_slp_loads (loop_vinfo);
2328 :
2329 412367 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2330 : {
2331 14203 : dump_printf_loc (MSG_NOTE, vect_location,
2332 : "vectorization_factor = ");
2333 14203 : dump_dec (MSG_NOTE, vectorization_factor);
2334 14203 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2335 14203 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2336 : }
2337 :
2338 412367 : if (max_vf != MAX_VECTORIZATION_FACTOR
2339 412367 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2340 41 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2341 :
2342 412326 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2343 :
2344 : /* Analyze the alignment of the data-refs in the loop. */
2345 412326 : vect_analyze_data_refs_alignment (loop_vinfo);
2346 :
2347 : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2348 : It is important to call pruning after vect_analyze_data_ref_accesses,
2349 : since we use grouping information gathered by interleaving analysis. */
2350 412326 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2351 412326 : if (!ok)
2352 16739 : return ok;
2353 :
2354 : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2355 : vectorization, since we do not want to add extra peeling or
2356 : add versioning for alignment. */
2357 395587 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2358 : /* This pass will decide on using loop versioning and/or loop peeling in
2359 : order to enhance the alignment of data references in the loop. */
2360 380940 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2361 395587 : if (!ok)
2362 0 : return ok;
2363 :
2364 : /* Analyze operations in the SLP instances. We can't simply
2365 : remove unsupported SLP instances as this makes the above
2366 : SLP kind detection invalid and might also affect the VF. */
2367 395587 : if (! vect_slp_analyze_operations (loop_vinfo))
2368 : {
2369 240318 : ok = opt_result::failure_at (vect_location,
2370 : "unsupported SLP instances\n");
2371 240318 : goto again;
2372 : }
2373 :
2374 : /* For now, we don't expect to mix both masking and length approaches for one
2375 : loop, disable it if both are recorded. */
2376 155269 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2377 23399 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2378 178662 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2379 : {
2380 0 : if (dump_enabled_p ())
2381 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2382 : "can't vectorize a loop with partial vectors"
2383 : " because we don't expect to mix different"
2384 : " approaches with partial vectors for the"
2385 : " same loop.\n");
2386 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2387 : }
2388 :
2389 : /* If we still have the option of using partial vectors,
2390 : check whether we can generate the necessary loop controls. */
2391 155269 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2392 : {
2393 23399 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2394 : {
2395 23393 : if (!vect_verify_full_masking (loop_vinfo)
2396 23393 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2397 6063 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2398 : }
2399 : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2400 6 : if (!vect_verify_loop_lens (loop_vinfo))
2401 6 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2402 : }
2403 :
2404 : /* Decide whether this loop_vinfo should use partial vectors or peeling,
2405 : assuming that the loop will be used as a main loop. We will redo
2406 : this analysis later if we instead decide to use the loop as an
2407 : epilogue loop. */
2408 155269 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, masked_p);
2409 155269 : if (!ok)
2410 0 : return ok;
2411 :
2412 : /* If we're vectorizing a loop that uses length "controls" and
2413 : can iterate more than once, we apply decrementing IV approach
2414 : in loop control. */
2415 155269 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2416 31 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2417 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2418 155269 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2419 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2420 : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2421 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2422 :
2423 : /* If a loop uses length controls and has a decrementing loop control IV,
2424 : we will normally pass that IV through a MIN_EXPR to calculate the
2425 : basis for the length controls. E.g. in a loop that processes one
2426 : element per scalar iteration, the number of elements would be
2427 : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2428 :
2429 : This MIN_EXPR approach allows us to use pointer IVs with an invariant
2430 : step, since only the final iteration of the vector loop can have
2431 : inactive lanes.
2432 :
2433 : However, some targets have a dedicated instruction for calculating the
2434 : preferred length, given the total number of elements that still need to
2435 : be processed. This is encapsulated in the SELECT_VL internal function.
2436 :
2437 : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2438 : to determine the basis for the length controls. However, unlike the
2439 : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2440 : lanes inactive in any iteration of the vector loop, not just the last
2441 : iteration. This SELECT_VL approach therefore requires us to use pointer
2442 : IVs with variable steps.
2443 :
2444 : Once we've decided how many elements should be processed by one
2445 : iteration of the vector loop, we need to populate the rgroup controls.
2446 : If a loop has multiple rgroups, we need to make sure that those rgroups
2447 : "line up" (that is, they must be consistent about which elements are
2448 : active and which aren't). This is done by vect_adjust_loop_lens_control.
2449 :
2450 : In principle, it would be possible to use vect_adjust_loop_lens_control
2451 : on either the result of a MIN_EXPR or the result of a SELECT_VL.
2452 : However:
2453 :
2454 : (1) In practice, it only makes sense to use SELECT_VL when a vector
2455 : operation will be controlled directly by the result. It is not
2456 : worth using SELECT_VL if it would only be the input to other
2457 : calculations.
2458 :
2459 : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2460 : pointer IV will need N updates by a variable amount (N-1 updates
2461 : within the iteration and 1 update to move to the next iteration).
2462 :
2463 : Because of this, we prefer to use the MIN_EXPR approach whenever there
2464 : is more than one length control.
2465 :
2466 : In addition, SELECT_VL always operates to a granularity of 1 unit.
2467 : If we wanted to use it to control an SLP operation on N consecutive
2468 : elements, we would need to make the SELECT_VL inputs measure scalar
2469 : iterations (rather than elements) and then multiply the SELECT_VL
2470 : result by N. But using SELECT_VL this way is inefficient because
2471 : of (1) above.
2472 :
2473 : 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2474 : satisfied:
2475 :
2476 : (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2477 : (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2478 :
2479 : Since SELECT_VL (variable step) will make SCEV analysis fail, and then
2480 : we would lose the benefit of subsequent unroll optimizations, we prefer
2481 : using the MIN_EXPR approach in this situation. */
2482 155269 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2483 : {
2484 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2485 0 : if (LOOP_VINFO_LENS (loop_vinfo).length () == 1
2486 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
2487 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2488 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2489 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2490 :
2491 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2492 0 : for (auto rgc : LOOP_VINFO_LENS (loop_vinfo))
2493 0 : if (rgc.type
2494 0 : && !direct_internal_fn_supported_p (IFN_SELECT_VL,
2495 : rgc.type, iv_type,
2496 : OPTIMIZE_FOR_SPEED))
2497 : {
2498 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2499 0 : break;
2500 : }
2501 :
2502 : /* If any of the SLP instances cover more than a single lane
2503 : we cannot use .SELECT_VL at the moment, even if the number
2504 : of lanes is uniform throughout the SLP graph. */
2505 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2506 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
2507 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
2508 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
2509 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
2510 : {
2511 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2512 0 : break;
2513 : }
2514 : }
2515 :
2516 : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2517 : to be able to handle fewer than VF scalars, or needs to have a lower VF
2518 : than the main loop. */
2519 155269 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2520 13440 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2521 : {
2522 13426 : poly_uint64 unscaled_vf
2523 13426 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2524 : orig_loop_vinfo->suggested_unroll_factor);
2525 13426 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
2526 379 : return opt_result::failure_at (vect_location,
2527 : "Vectorization factor too high for"
2528 : " epilogue loop.\n");
2529 : }
2530 :
2531 : /* If the epilogue needs peeling for gaps but the main loop doesn't, give
2532 : up on the epilogue. */
2533 154890 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2534 13061 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2535 73 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
2536 : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2537 4 : return opt_result::failure_at (vect_location,
2538 : "Epilogue loop requires peeling for gaps "
2539 : "but main loop does not.\n");
2540 :
2541 : /* If an epilogue loop is required make sure we can create one. */
2542 154886 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2543 153609 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2544 56587 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2545 : {
2546 99814 : if (dump_enabled_p ())
2547 5552 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2548 99814 : if (!vect_can_advance_ivs_p (loop_vinfo)
2549 199096 : || !slpeel_can_duplicate_loop_p (loop,
2550 : LOOP_VINFO_MAIN_EXIT (loop_vinfo),
2551 99282 : LOOP_VINFO_MAIN_EXIT (loop_vinfo)))
2552 : {
2553 532 : ok = opt_result::failure_at (vect_location,
2554 : "not vectorized: can't create required "
2555 : "epilog loop\n");
2556 532 : goto again;
2557 : }
2558 : }
2559 :
2560 : /* Check the costings of the loop make vectorizing worthwhile. */
2561 154354 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2562 154354 : if (res < 0 && !param_vect_allow_possibly_not_worthwhile_vectorizations)
2563 : {
2564 28237 : ok = opt_result::failure_at (vect_location,
2565 : "Loop costings may not be worthwhile.\n");
2566 28237 : goto again;
2567 : }
2568 126117 : if (!res)
2569 31445 : return opt_result::failure_at (vect_location,
2570 : "Loop costings not worthwhile.\n");
2571 :
2572 : /* During peeling, we need to check if number of loop iterations is
2573 : enough for both peeled prolog loop and vector loop. This check
2574 : can be merged along with threshold check of loop versioning, so
2575 : increase threshold for this case if necessary.
2576 :
2577 : If we are analyzing an epilogue we still want to check what its
2578 : versioning threshold would be. If we decide to vectorize the epilogues we
2579 : will want to use the lowest versioning threshold of all epilogues and main
2580 : loop. This will enable us to enter a vectorized epilogue even when
2581 : versioning the loop. We can't simply check whether the epilogue requires
2582 : versioning though since we may have skipped some versioning checks when
2583 : analyzing the epilogue. For instance, checks for alias versioning will be
2584 : skipped when dealing with epilogues as we assume we already checked them
2585 : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2586 94672 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2587 : {
2588 8949 : poly_uint64 niters_th = 0;
2589 8949 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2590 :
2591 8949 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2592 : {
2593 : /* Niters for peeled prolog loop. */
2594 8949 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2595 : {
2596 115 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2597 115 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2598 115 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2599 : }
2600 : else
2601 8834 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2602 : }
2603 :
2604 : /* Niters for at least one iteration of vectorized loop. */
2605 8949 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2606 8945 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2607 : /* One additional iteration because of peeling for gap. */
2608 8949 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2609 66 : niters_th += 1;
2610 :
2611 : /* Use the same condition as vect_transform_loop to decide when to use
2612 : the cost to determine a versioning threshold. */
2613 8949 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2614 8949 : && ordered_p (th, niters_th))
2615 6631 : niters_th = ordered_max (poly_uint64 (th), niters_th);
2616 :
2617 8949 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2618 : }
2619 :
2620 94672 : gcc_assert (known_eq (vectorization_factor,
2621 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2622 :
2623 94672 : single_lane_slp_done_for_suggested_uf = force_single_lane;
2624 :
2625 : /* Ok to vectorize! */
2626 94672 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2627 94672 : return opt_result::success ();
2628 :
2629 269087 : again:
2630 : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2631 269087 : gcc_assert (!ok);
2632 :
2633 : /* Try again with single-lane SLP, unless that is what we just tried. */
2634 269087 : if (force_single_lane)
2635 133230 : return ok;
2636 :
2637 : /* If we are applying suggested unroll factor, we don't need to
2638 : re-try any more as we want to keep the SLP mode fixed. */
2639 135857 : if (applying_suggested_uf)
2640 10 : return ok;
2641 :
2642 : /* Likewise if the grouped loads or stores in the SLP cannot be handled
2643 : via interleaving or lane instructions. */
2644 : slp_instance instance;
2645 : slp_tree node;
2646 : unsigned i, j;
2647 368637 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2648 : {
2649 233742 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
2650 0 : continue;
2651 :
2652 233742 : stmt_vec_info vinfo;
2653 233742 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2654 233742 : if (!vinfo || !STMT_VINFO_GROUPED_ACCESS (vinfo))
2655 230980 : continue;
2656 2762 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2657 2762 : unsigned int size = DR_GROUP_SIZE (vinfo);
2658 2762 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
2659 2762 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
2660 4849 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2661 5518 : && ! vect_grouped_store_supported (vectype, size))
2662 669 : return opt_result::failure_at (vinfo->stmt,
2663 : "unsupported grouped store\n");
2664 236308 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2665 : {
2666 2238 : vinfo = SLP_TREE_REPRESENTATIVE (node);
2667 2238 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
2668 : {
2669 1957 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2670 1957 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2671 1957 : size = DR_GROUP_SIZE (vinfo);
2672 1957 : vectype = SLP_TREE_VECTYPE (node);
2673 1957 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
2674 1957 : && ! vect_grouped_load_supported (vectype, single_element_p,
2675 : size))
2676 283 : return opt_result::failure_at (vinfo->stmt,
2677 : "unsupported grouped load\n");
2678 : }
2679 : }
2680 : }
2681 :
2682 : /* Roll back state appropriately. Force single-lane SLP this time. */
2683 134895 : force_single_lane = true;
2684 134895 : if (dump_enabled_p ())
2685 3381 : dump_printf_loc (MSG_NOTE, vect_location,
2686 : "re-trying with single-lane SLP\n");
2687 :
2688 : /* Reset the vectorization factor. */
2689 134895 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
2690 : /* Free the SLP instances. */
2691 367678 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2692 232783 : vect_free_slp_instance (instance);
2693 134895 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2694 : /* Reset altered state on stmts. */
2695 512633 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2696 : {
2697 377738 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2698 377738 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
2699 679514 : !gsi_end_p (si); gsi_next (&si))
2700 : {
2701 301776 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2702 301776 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2703 301776 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2704 : {
2705 : /* vectorizable_reduction adjusts reduction stmt def-types,
2706 : restore them to that of the PHI. */
2707 25668 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2708 25668 : = STMT_VINFO_DEF_TYPE (stmt_info);
2709 25668 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2710 : (STMT_VINFO_REDUC_DEF (stmt_info)))
2711 25668 : = STMT_VINFO_DEF_TYPE (stmt_info);
2712 : }
2713 : }
2714 : }
2715 : /* Free optimized alias test DDRS. */
2716 134895 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2717 134895 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2718 134895 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2719 : /* Reset target cost data. */
2720 134895 : delete loop_vinfo->vector_costs;
2721 134895 : loop_vinfo->vector_costs = nullptr;
2722 : /* Reset accumulated rgroup information. */
2723 134895 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
2724 134895 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
2725 134895 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2726 : /* Reset assorted flags. */
2727 134895 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2728 134895 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2729 134895 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2730 134895 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2731 134895 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2732 134895 : = saved_can_use_partial_vectors_p;
2733 134895 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2734 134895 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2735 134895 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2736 134895 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = false;
2737 :
2738 134895 : if (loop_vinfo->scan_map)
2739 122 : loop_vinfo->scan_map->empty ();
2740 :
2741 134895 : goto start_over;
2742 : }
2743 :
2744 : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears to be
2745 : better than vectorizing it using OLD_LOOP_VINFO; both must describe the
2746 : same loop. Assume that OLD_LOOP_VINFO is better unless something
2747 : specifically indicates otherwise. A VF equal to loop->simdlen is always
2748 : preferred; otherwise the vector_costs comparison functions decide.
2749 : Note that this deliberately isn't a partial order. */
2750 :
2751 : static bool
2752 32461 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2753 : loop_vec_info old_loop_vinfo)
2754 : {
2755 32461 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2756 32461 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2757 :
2758 32461 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2759 32461 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2760 :
2761 : /* Always prefer a VF of loop->simdlen over any other VF. If both or neither VF matches simdlen, fall through to the cost comparison. */
2762 32461 : if (loop->simdlen)
2763 : {
2764 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2765 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2766 0 : if (new_simdlen_p != old_simdlen_p)
2767 : return new_simdlen_p;
2768 : }
2769 : /* Compare as epilogue loops when OLD_LOOP_VINFO records an original loop vinfo, and as main loops otherwise. */
2770 32461 : const auto *old_costs = old_loop_vinfo->vector_costs;
2771 32461 : const auto *new_costs = new_loop_vinfo->vector_costs;
2772 32461 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2773 1482 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2774 :
2775 30979 : return new_costs->better_main_loop_than_p (old_costs);
2776 : }
2777 :
2778 : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2779 : true if we should, i.e. if vect_better_loop_vinfo_p prefers the new one. */
2780 :
2781 : static bool
2782 32461 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2783 : loop_vec_info old_loop_vinfo)
2784 : {
2785 32461 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2786 : return false;
2787 : /* NEW_LOOP_VINFO wins; name both vector modes in a dump note when dumping is enabled. */
2788 1348 : if (dump_enabled_p ())
2789 12 : dump_printf_loc (MSG_NOTE, vect_location,
2790 : "***** Preferring vector mode %s to vector mode %s\n",
2791 12 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
2792 12 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
2793 : return true;
2794 : }
2795 :
2796 : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
2797 : not NULL. When MASKED_P is not -1 override the default
2798 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it. FATAL is forwarded to
2799 : vect_analyze_loop_2. Set AUTODETECTED_VECTOR_MODE if the mode analyzed was
2800 : VOIDmode and advance MODE_I to the next mode useful to analyze.
2801 : Return the loop_vinfo on success and a propagated failure otherwise. */
2802 :
2803 : static opt_loop_vec_info
2804 574170 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2805 : const vect_loop_form_info *loop_form_info,
2806 : loop_vec_info orig_loop_vinfo,
2807 : const vector_modes &vector_modes, unsigned &mode_i,
2808 : int masked_p,
2809 : machine_mode &autodetected_vector_mode,
2810 : bool &fatal)
2811 : {
2812 574170 : loop_vec_info loop_vinfo
2813 574170 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
2814 : /* Tell the analysis which vector mode to use. */
2815 574170 : machine_mode vector_mode = vector_modes[mode_i];
2816 574170 : loop_vinfo->vector_mode = vector_mode;
2817 574170 : unsigned int suggested_unroll_factor = 1;
2818 574170 : bool single_lane_slp_done_for_suggested_uf = false;
2819 :
2820 : /* Run the main analysis. It may also report a suggested unroll factor and whether single-lane SLP was in use. */
2821 574170 : opt_result res = vect_analyze_loop_2 (loop_vinfo, masked_p, fatal,
2822 : &suggested_unroll_factor,
2823 : single_lane_slp_done_for_suggested_uf);
2824 574170 : if (dump_enabled_p ())
2825 21257 : dump_printf_loc (MSG_NOTE, vect_location,
2826 : "***** Analysis %s with vector mode %s\n",
2827 21257 : res ? "succeeded" : "failed",
2828 21257 : GET_MODE_NAME (loop_vinfo->vector_mode));
2829 : /* On main-loop success, honor a suggested unroll factor or else one derived from the user's request and the assumed VF. */
2830 574170 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
2831 574170 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2832 : /* Check to see if the user wants to unroll or if the target wants to. */
2833 660116 : && (suggested_unroll_factor > 1 || user_unroll > 1))
2834 : {
2835 482 : if (suggested_unroll_factor == 1)
2836 : {
2837 66 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
2838 66 : suggested_unroll_factor = user_unroll / assumed_vf;
2839 66 : if (suggested_unroll_factor > 1)
2840 : {
2841 40 : if (dump_enabled_p ())
2842 20 : dump_printf_loc (MSG_NOTE, vect_location,
2843 : "setting unroll factor to %d based on user requested "
2844 : "unroll factor %d and suggested vectorization "
2845 : "factor: %d\n",
2846 : suggested_unroll_factor, user_unroll, assumed_vf);
2847 : }
2848 : }
2849 : /* Re-analyze on a fresh loop_vinfo with the unroll factor applied; keep that result only if it succeeds. */
2850 482 : if (suggested_unroll_factor > 1)
2851 : {
2852 456 : if (dump_enabled_p ())
2853 62 : dump_printf_loc (MSG_NOTE, vect_location,
2854 : "***** Re-trying analysis for unrolling"
2855 : " with unroll factor %d and %s slp.\n",
2856 : suggested_unroll_factor,
2857 : single_lane_slp_done_for_suggested_uf
2858 : ? "single-lane" : "");
2859 456 : loop_vec_info unroll_vinfo
2860 456 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
2861 456 : unroll_vinfo->vector_mode = vector_mode;
2862 456 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2863 456 : opt_result new_res
2864 456 : = vect_analyze_loop_2 (unroll_vinfo, masked_p, fatal, NULL,
2865 : single_lane_slp_done_for_suggested_uf);
2866 456 : if (new_res)
2867 : {
2868 397 : delete loop_vinfo;
2869 397 : loop_vinfo = unroll_vinfo;
2870 : }
2871 : else
2872 59 : delete unroll_vinfo;
2873 : }
2874 :
2875 : /* Record that we have honored a user unroll factor. Note this is set whenever USER_UNROLL > 1, whether or not the re-analysis above ran or succeeded. */
2876 482 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
2877 : }
2878 :
2879 : /* Remember the autodetected vector mode. */
2880 574170 : if (vector_mode == VOIDmode)
2881 266495 : autodetected_vector_mode = loop_vinfo->vector_mode;
2882 :
2883 : /* Advance mode_i, first skipping modes that would result in the
2884 : same analysis result, then at most one mode that would repeat the analysis for the autodetected mode. */
2885 2533746 : while (mode_i + 1 < vector_modes.length ()
2886 1743491 : && vect_chooses_same_modes_p (loop_vinfo,
2887 763703 : vector_modes[mode_i + 1]))
2888 : {
2889 405618 : if (dump_enabled_p ())
2890 17055 : dump_printf_loc (MSG_NOTE, vect_location,
2891 : "***** The result for vector mode %s would"
2892 : " be the same\n",
2893 17055 : GET_MODE_NAME (vector_modes[mode_i + 1]));
2894 405618 : mode_i += 1;
2895 : }
2896 574170 : if (mode_i + 1 < vector_modes.length ()
2897 932255 : && vect_chooses_same_modes_p (autodetected_vector_mode,
2898 358085 : vector_modes[mode_i + 1]))
2899 : {
2900 420 : if (dump_enabled_p ())
2901 11 : dump_printf_loc (MSG_NOTE, vect_location,
2902 : "***** Skipping vector mode %s, which would"
2903 : " repeat the analysis for %s\n",
2904 11 : GET_MODE_NAME (vector_modes[mode_i + 1]),
2905 11 : GET_MODE_NAME (autodetected_vector_mode));
2906 420 : mode_i += 1;
2907 : }
2908 574170 : mode_i++;
2909 : /* A failed analysis is not handed back to the caller, so free its loop_vinfo here and propagate the failure. */
2910 574170 : if (!res)
2911 : {
2912 479895 : delete loop_vinfo;
2913 479895 : if (fatal)
2914 102457 : gcc_checking_assert (orig_loop_vinfo == NULL);
2915 479895 : return opt_loop_vec_info::propagate_failure (res);
2916 : }
2917 :
2918 94275 : return opt_loop_vec_info::success (loop_vinfo);
2919 : }
2920 :
2921 : /* Function vect_analyze_loop.
2922 :
2923 : Apply a set of analyses on LOOP, and create a loop_vec_info struct
2924 : for it. The different analyses will record information in the
2925 : loop_vec_info struct. */
2926 : opt_loop_vec_info
2927 464638 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
2928 : vec_info_shared *shared)
2929 : {
2930 464638 : DUMP_VECT_SCOPE ("analyze_loop_nest");
2931 :
2932 464638 : if (loop_outer (loop)
2933 464638 : && loop_vec_info_for_loop (loop_outer (loop))
2934 465220 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2935 582 : return opt_loop_vec_info::failure_at (vect_location,
2936 : "outer-loop already vectorized.\n");
2937 :
2938 464056 : if (!find_loop_nest (loop, &shared->loop_nest))
2939 22302 : return opt_loop_vec_info::failure_at
2940 22302 : (vect_location,
2941 : "not vectorized: loop nest containing two or more consecutive inner"
2942 : " loops cannot be vectorized\n");
2943 :
2944 : /* Analyze the loop form. */
2945 441754 : vect_loop_form_info loop_form_info;
2946 441754 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
2947 : &loop_form_info);
2948 441754 : if (!res)
2949 : {
2950 175259 : if (dump_enabled_p ())
2951 1527 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2952 : "bad loop form.\n");
2953 175259 : return opt_loop_vec_info::propagate_failure (res);
2954 : }
2955 266495 : if (!integer_onep (loop_form_info.assumptions))
2956 : {
2957 : /* We consider to vectorize this loop by versioning it under
2958 : some assumptions. In order to do this, we need to clear
2959 : existing information computed by scev and niter analyzer. */
2960 8366 : scev_reset_htab ();
2961 8366 : free_numbers_of_iterations_estimates (loop);
2962 : /* Also set flag for this loop so that following scev and niter
2963 : analysis are done under the assumptions. */
2964 8366 : loop_constraint_set (loop, LOOP_C_FINITE);
2965 : }
2966 : else
2967 : /* Clear the existing niter information to make sure the nonwrapping flag
2968 : will be calculated and set appropriately. */
2969 258129 : free_numbers_of_iterations_estimates (loop);
2970 :
2971 266495 : auto_vector_modes vector_modes;
2972 : /* Autodetect first vector size we try. */
2973 266495 : vector_modes.safe_push (VOIDmode);
2974 266495 : unsigned int autovec_flags
2975 532990 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2976 266495 : loop->simdlen != 0);
2977 266495 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2978 266495 : && !unlimited_cost_model (loop));
2979 266495 : machine_mode autodetected_vector_mode = VOIDmode;
2980 266495 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2981 266495 : unsigned int mode_i = 0;
2982 266495 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2983 :
2984 : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
2985 : a mode has not been analyzed. */
2986 266495 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
2987 2676650 : for (unsigned i = 0; i < vector_modes.length (); ++i)
2988 1071830 : cached_vf_per_mode.safe_push (0);
2989 :
2990 : /* First determine the main loop vectorization mode, either the first
2991 : one that works, starting with auto-detecting the vector mode and then
2992 : following the target's order of preference, or the one with the
2993 : lowest cost if pick_lowest_cost_p. */
2994 854161 : while (1)
2995 : {
2996 560328 : bool fatal;
2997 560328 : unsigned int last_mode_i = mode_i;
2998 : /* Set cached VF to -1 prior to analysis, which indicates a mode has
2999 : failed. */
3000 560328 : cached_vf_per_mode[last_mode_i] = -1;
3001 560328 : opt_loop_vec_info loop_vinfo
3002 560328 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3003 : NULL, vector_modes, mode_i, -1,
3004 : autodetected_vector_mode, fatal);
3005 560328 : if (fatal)
3006 : break;
3007 :
3008 457871 : if (loop_vinfo)
3009 : {
3010 : /* Analysis has been successful so update the VF value. The
3011 : VF should always be a multiple of unroll_factor and we want to
3012 : capture the original VF here. */
3013 85946 : cached_vf_per_mode[last_mode_i]
3014 85946 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3015 85946 : loop_vinfo->suggested_unroll_factor);
3016 : /* Once we hit the desired simdlen for the first time,
3017 : discard any previous attempts. */
3018 85946 : if (simdlen
3019 85946 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3020 : {
3021 47 : delete first_loop_vinfo;
3022 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3023 : simdlen = 0;
3024 : }
3025 85899 : else if (pick_lowest_cost_p
3026 71921 : && first_loop_vinfo
3027 116878 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3028 : {
3029 : /* Pick loop_vinfo over first_loop_vinfo. */
3030 1178 : delete first_loop_vinfo;
3031 1178 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3032 : }
3033 85946 : if (first_loop_vinfo == NULL)
3034 : first_loop_vinfo = loop_vinfo;
3035 : else
3036 : {
3037 29803 : delete loop_vinfo;
3038 29803 : loop_vinfo = opt_loop_vec_info::success (NULL);
3039 : }
3040 :
3041 : /* Commit to first_loop_vinfo if we have no reason to try
3042 : alternatives. */
3043 85946 : if (!simdlen && !pick_lowest_cost_p)
3044 : break;
3045 : }
3046 443855 : if (mode_i == vector_modes.length ()
3047 443855 : || autodetected_vector_mode == VOIDmode)
3048 : break;
3049 :
3050 : /* Try the next biggest vector size. */
3051 293833 : if (dump_enabled_p ())
3052 4773 : dump_printf_loc (MSG_NOTE, vect_location,
3053 : "***** Re-trying analysis with vector mode %s\n",
3054 4773 : GET_MODE_NAME (vector_modes[mode_i]));
3055 293833 : }
3056 266495 : if (!first_loop_vinfo)
3057 211535 : return opt_loop_vec_info::propagate_failure (res);
3058 :
3059 54960 : if (dump_enabled_p ())
3060 9559 : dump_printf_loc (MSG_NOTE, vect_location,
3061 : "***** Choosing vector mode %s\n",
3062 9559 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3063 :
3064 : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3065 : enabled, SIMDUID is not set, it is the innermost loop and we have
3066 : either already found the loop's SIMDLEN or there was no SIMDLEN to
3067 : begin with.
3068 : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3069 54960 : bool vect_epilogues = (!simdlen
3070 54958 : && loop->inner == NULL
3071 54362 : && param_vect_epilogues_nomask
3072 53220 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3073 : /* No code motion support for multiple epilogues so for now
3074 : not supported when multiple exits. */
3075 26032 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3076 25568 : && !loop->simduid
3077 79115 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3078 54960 : if (!vect_epilogues)
3079 42019 : return first_loop_vinfo;
3080 :
3081 : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3082 :
3083 : /* For epilogues start the analysis from the first mode. The motivation
3084 : behind starting from the beginning comes from cases where the VECTOR_MODES
3085 : array may contain length-agnostic and length-specific modes. Their
3086 : ordering is not guaranteed, so we could end up picking a mode for the main
3087 : loop that is after the epilogue's optimal mode. */
3088 12941 : int masked_p = -1;
3089 12941 : if (!unlimited_cost_model (loop)
3090 12941 : && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3091 : != VOIDmode))
3092 : {
3093 4 : vector_modes[0]
3094 4 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3095 4 : cached_vf_per_mode[0] = 0;
3096 : }
3097 : else
3098 12937 : vector_modes[0] = autodetected_vector_mode;
3099 12941 : mode_i = 0;
3100 :
3101 12977 : bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
3102 12941 : || masked_p == 1);
3103 : if (supports_partial_vectors
3104 36 : && !partial_vectors_supported_p ()
3105 36 : && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo))
3106 : supports_partial_vectors = false;
3107 12941 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3108 :
3109 12941 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3110 13107 : do
3111 : {
3112 : /* Let the user override what the target suggests. */
3113 13024 : if (OPTION_SET_P (param_vect_partial_vector_usage))
3114 45 : masked_p = -1;
3115 :
3116 50392 : while (1)
3117 : {
3118 : /* If the target does not support partial vectors we can shorten the
3119 : number of modes to analyze for the epilogue as we know we can't
3120 : pick a mode that would lead to a VF at least as big as the
3121 : FIRST_VINFO_VF. */
3122 67065 : if (!supports_partial_vectors
3123 50392 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3124 : {
3125 23805 : mode_i++;
3126 47610 : if (mode_i == vector_modes.length ())
3127 : break;
3128 29418 : continue;
3129 : }
3130 : /* We would need an exhaustive search to find all modes we
3131 : skipped but that would lead to the same result as the
3132 : analysis it was skipped for and where we could check
3133 : cached_vf_per_mode against.
3134 : Check for the autodetected mode, which is the common
3135 : situation on x86 which does not perform cost comparison. */
3136 39332 : if (!supports_partial_vectors
3137 26544 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3138 52344 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3139 25757 : vector_modes[mode_i]))
3140 : {
3141 12745 : mode_i++;
3142 25490 : if (mode_i == vector_modes.length ())
3143 : break;
3144 12745 : continue;
3145 : }
3146 :
3147 13842 : if (dump_enabled_p ())
3148 3255 : dump_printf_loc (MSG_NOTE, vect_location,
3149 : "***** Re-trying epilogue analysis with vector "
3150 3255 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3151 :
3152 13842 : bool fatal;
3153 13842 : opt_loop_vec_info loop_vinfo
3154 13842 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3155 : orig_loop_vinfo,
3156 : vector_modes, mode_i, masked_p,
3157 : autodetected_vector_mode, fatal);
3158 13842 : if (fatal)
3159 : break;
3160 :
3161 13842 : if (loop_vinfo)
3162 : {
3163 8329 : if (pick_lowest_cost_p
3164 5379 : && orig_loop_vinfo->epilogue_vinfo
3165 9811 : && vect_joust_loop_vinfos (loop_vinfo,
3166 1482 : orig_loop_vinfo->epilogue_vinfo))
3167 : {
3168 170 : gcc_assert (vect_epilogues);
3169 170 : delete orig_loop_vinfo->epilogue_vinfo;
3170 170 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3171 : }
3172 8329 : if (!orig_loop_vinfo->epilogue_vinfo)
3173 7017 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3174 : else
3175 : {
3176 1312 : delete loop_vinfo;
3177 1312 : loop_vinfo = opt_loop_vec_info::success (NULL);
3178 : }
3179 :
3180 : /* For now only allow one epilogue loop, but allow
3181 : pick_lowest_cost_p to replace it, so commit to the
3182 : first epilogue if we have no reason to try alternatives. */
3183 8329 : if (!pick_lowest_cost_p)
3184 : break;
3185 : }
3186 :
3187 : /* Revert back to the default from the suggested preferred
3188 : epilogue vectorization mode. */
3189 10892 : masked_p = -1;
3190 21784 : if (mode_i == vector_modes.length ())
3191 : break;
3192 : }
3193 :
3194 13024 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3195 13024 : if (!orig_loop_vinfo)
3196 : break;
3197 :
3198 : /* When we selected a first vectorized epilogue, see if the target
3199 : suggests to have another one. */
3200 6847 : masked_p = -1;
3201 6847 : if (!unlimited_cost_model (loop)
3202 3903 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3203 10744 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3204 : != VOIDmode))
3205 : {
3206 166 : vector_modes[0]
3207 83 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3208 83 : cached_vf_per_mode[0] = 0;
3209 83 : mode_i = 0;
3210 : }
3211 : else
3212 : break;
3213 83 : }
3214 : while (1);
3215 :
3216 12941 : if (first_loop_vinfo->epilogue_vinfo)
3217 : {
3218 6772 : poly_uint64 lowest_th
3219 6772 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3220 6772 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3221 6847 : do
3222 : {
3223 6847 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3224 6847 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3225 : || maybe_ne (lowest_th, 0U));
3226 : /* Keep track of the known smallest versioning threshold. */
3227 6847 : if (ordered_p (lowest_th, th))
3228 6847 : lowest_th = ordered_min (lowest_th, th);
3229 6847 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3230 : }
3231 6847 : while (epilog_vinfo);
3232 6772 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3233 6772 : if (dump_enabled_p ())
3234 1449 : dump_printf_loc (MSG_NOTE, vect_location,
3235 : "***** Choosing epilogue vector mode %s\n",
3236 1449 : GET_MODE_NAME
3237 : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3238 : }
3239 :
3240 12941 : return first_loop_vinfo;
3241 708249 : }
3242 :
3243 : /* Return true if there is an in-order (fold-left) reduction function for CODE,
3244 : storing it in *REDUC_FN if so. *REDUC_FN is not written when false is returned. */
3245 :
3246 : static bool
3247 5083 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3248 : {
3249 : /* We support MINUS_EXPR by negating the operand. This also preserves an
3250 : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3251 : (-0.0) = -0.0. */
3252 5083 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3253 : {
3254 4411 : *reduc_fn = IFN_FOLD_LEFT_PLUS; /* Shared by PLUS_EXPR and MINUS_EXPR. */
3255 0 : return true;
3256 : }
3257 : return false; /* No in-order reduction function is provided for any other CODE. */
3258 : }
3259 :
3260 : /* Function reduction_fn_for_scalar_code
3261 :
3262 : Input:
3263 : CODE - the code of a reduction operation, either a tree code or a combined function.
3264 :
3265 : Output:
3266 : REDUC_FN - the corresponding internal function to be used to reduce the
3267 : vector of partial results into a single scalar result, or IFN_LAST
3268 : if the operation is a supported reduction operation, but does not have
3269 : such an internal function. Not written when FALSE is returned.
3270 :
3271 : Return FALSE if CODE currently cannot be vectorized as reduction. */
3272 :
3273 : bool
3274 2030024 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3275 : {
3276 2030024 : if (code.is_tree_code ())
3277 2029966 : switch (tree_code (code))
3278 : {
3279 15328 : case MAX_EXPR:
3280 15328 : *reduc_fn = IFN_REDUC_MAX;
3281 15328 : return true;
3282 :
3283 62655 : case MIN_EXPR:
3284 62655 : *reduc_fn = IFN_REDUC_MIN;
3285 62655 : return true;
3286 :
3287 1102729 : case PLUS_EXPR:
3288 1102729 : *reduc_fn = IFN_REDUC_PLUS;
3289 1102729 : return true;
3290 :
3291 232490 : case BIT_AND_EXPR:
3292 232490 : *reduc_fn = IFN_REDUC_AND;
3293 232490 : return true;
3294 :
3295 287494 : case BIT_IOR_EXPR:
3296 287494 : *reduc_fn = IFN_REDUC_IOR;
3297 287494 : return true;
3298 :
3299 44295 : case BIT_XOR_EXPR:
3300 44295 : *reduc_fn = IFN_REDUC_XOR;
3301 44295 : return true;
3302 :
3303 284975 : case MULT_EXPR:
3304 284975 : case MINUS_EXPR:
3305 284975 : *reduc_fn = IFN_LAST; /* Supported, but without a dedicated reduction function. */
3306 284975 : return true;
3307 :
3308 : default:
3309 : return false;
3310 : }
3311 : else
3312 58 : switch (combined_fn (code)) /* CODE is not a tree code; only FMAX and FMIN are handled. */
3313 : {
3314 34 : CASE_CFN_FMAX:
3315 34 : *reduc_fn = IFN_REDUC_FMAX;
3316 34 : return true;
3317 :
3318 24 : CASE_CFN_FMIN:
3319 24 : *reduc_fn = IFN_REDUC_FMIN;
3320 24 : return true;
3321 :
3322 : default:
3323 : return false;
3324 : }
3325 : }
3326 :
3327 : /* Set *SBOOL_FN to the corresponding function working on vector masks
3328 : for REDUC_FN. Return true if that exists, false otherwise (*SBOOL_FN is then not written). */
3329 :
3330 : static bool
3331 0 : sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
3332 : {
3333 0 : switch (reduc_fn) /* Only the AND, IOR and XOR reductions have a mask counterpart. */
3334 : {
3335 0 : case IFN_REDUC_AND:
3336 0 : *sbool_fn = IFN_REDUC_SBOOL_AND;
3337 0 : return true;
3338 0 : case IFN_REDUC_IOR:
3339 0 : *sbool_fn = IFN_REDUC_SBOOL_IOR;
3340 0 : return true;
3341 0 : case IFN_REDUC_XOR:
3342 0 : *sbool_fn = IFN_REDUC_SBOOL_XOR;
3343 0 : return true;
3344 : default:
3345 : return false;
3346 : }
3347 : }
3348 :
3349 : /* If there is a neutral value X such that a reduction would not be affected
3350 : by the introduction of additional X elements, return that X, otherwise
3351 : return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3352 : of the scalar elements. If the reduction has just a single initial value
3353 : then INITIAL_VALUE is that value, otherwise it is null.
3354 : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3355 : In that case no signed zero is returned. */
3356 :
3357 : tree
3358 77549 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3359 : tree initial_value, bool as_initial)
3360 : {
3361 77549 : if (code.is_tree_code ())
3362 77491 : switch (tree_code (code))
3363 : {
3364 13836 : case DOT_PROD_EXPR:
3365 13836 : case SAD_EXPR:
3366 13836 : case MINUS_EXPR:
3367 13836 : case BIT_IOR_EXPR:
3368 13836 : case BIT_XOR_EXPR:
3369 13836 : return build_zero_cst (scalar_type); /* Zero is the neutral value for these codes. */
3370 57475 : case WIDEN_SUM_EXPR:
3371 57475 : case PLUS_EXPR:
3372 57475 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3373 100 : return build_real (scalar_type, dconstm0); /* Signed zero, never used as initial value. */
3374 : else
3375 57375 : return build_zero_cst (scalar_type);
3376 :
3377 2165 : case MULT_EXPR:
3378 2165 : return build_one_cst (scalar_type);
3379 :
3380 1558 : case BIT_AND_EXPR:
3381 1558 : return build_all_ones_cst (scalar_type);
3382 :
3383 : case MAX_EXPR:
3384 : case MIN_EXPR:
3385 : return initial_value; /* Null unless there is a single initial value. */
3386 :
3387 428 : default:
3388 428 : return NULL_TREE;
3389 : }
3390 : else
3391 58 : switch (combined_fn (code)) /* CODE is not a tree code. */
3392 : {
3393 : CASE_CFN_FMIN:
3394 : CASE_CFN_FMAX:
3395 : return initial_value; /* Same treatment as MIN_EXPR and MAX_EXPR above. */
3396 :
3397 0 : default:
3398 0 : return NULL_TREE;
3399 : }
3400 : }
3401 :
3402 : /* Reporting helper for vect_is_simple_reduction below. GIMPLE statement
3403 : STMT is printed with a message MSG, using dump kind MSG_TYPE at vect_location. */
3404 :
3405 : static void
3406 578 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3407 : {
3408 578 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt); /* MSG first, then STMT. */
3409 578 : }
3410 :
3411 : /* Return true if we need an in-order reduction for operation CODE
3412 : on type TYPE. Floating-point MIN/MAX operations never need one; any other
3413 : floating-point operation needs one unless flag_associative_math is set. */
3414 :
3415 : bool
3416 6544927 : needs_fold_left_reduction_p (tree type, code_helper code)
3417 : {
3418 : /* CHECKME: check for !flag_finite_math_only too? */
3419 6544927 : if (SCALAR_FLOAT_TYPE_P (type))
3420 : {
3421 581529 : if (code.is_tree_code ())
3422 581475 : switch (tree_code (code))
3423 : {
3424 : case MIN_EXPR:
3425 : case MAX_EXPR:
3426 : return false;
3427 :
3428 579619 : default:
3429 579619 : return !flag_associative_math;
3430 : }
3431 : else
3432 54 : switch (combined_fn (code))
3433 : {
3434 : CASE_CFN_FMIN:
3435 : CASE_CFN_FMAX:
3436 : return false;
3437 :
3438 2 : default:
3439 2 : return !flag_associative_math;
3440 : }
3441 : }
3442 : /* Integral types: in-order unless CODE is a tree code for which operation_no_trapping_overflow holds. */
3443 5963398 : if (INTEGRAL_TYPE_P (type))
3444 5962515 : return (!code.is_tree_code ()
3445 5962515 : || !operation_no_trapping_overflow (type, tree_code (code)));
3446 : /* Saturating fixed-point types always need an in-order reduction. */
3447 883 : if (SAT_FIXED_POINT_TYPE_P (type))
3448 : return true;
3449 : /* Every other type does not. */
3450 : return false;
3451 : }
3452 :
3453 : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3454 : has a handled computation expression. Store the main reduction
3455 : operation in *CODE and, in PATH, the chain of uses walked from LOOP_ARG back to the PHI result. */
3456 : /* LOC is the location used for dumping. If INNER_LOOP_OF_DOUBLE_REDUC, out-of-loop uses of path values are never tolerated. */
3457 : static bool
3458 101961 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3459 : tree loop_arg, code_helper *code,
3460 : vec<std::pair<ssa_op_iter, use_operand_p> > &path,
3461 : bool inner_loop_of_double_reduc)
3462 : {
3463 101961 : auto_bitmap visited; /* SSA name versions already entered by the walk. */
3464 101961 : tree lookfor = PHI_RESULT (phi); /* Reaching this value closes the cycle. */
3465 101961 : ssa_op_iter curri;
3466 101961 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3467 212077 : while (USE_FROM_PTR (curr) != loop_arg) /* Find the PHI operand that is LOOP_ARG. */
3468 8155 : curr = op_iter_next_use (&curri);
3469 101961 : curri.i = curri.numops; /* NOTE(review): presumably marks this iterator as exhausted so backtracking cannot try other PHI args - confirm. */
3470 949228 : do /* Depth-first walk over use->def edges, with PATH as the explicit stack. */
3471 : {
3472 949228 : path.safe_push (std::make_pair (curri, curr));
3473 949228 : tree use = USE_FROM_PTR (curr);
3474 949228 : if (use == lookfor)
3475 : break;
3476 847677 : gimple *def = SSA_NAME_DEF_STMT (use);
3477 847677 : if (gimple_nop_p (def)
3478 847677 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3479 : {
3480 713432 : pop: /* Dead end: backtrack to the next unvisited SSA operand of an earlier stmt. */
3481 713432 : do
3482 : {
3483 713432 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3484 713432 : curri = x.first;
3485 713432 : curr = x.second;
3486 780850 : do
3487 780850 : curr = op_iter_next_use (&curri);
3488 : /* Skip already visited or non-SSA operands (from iterating
3489 : over PHI args). */
3490 : while (curr != NULL_USE_OPERAND_P
3491 1561700 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3492 269813 : || ! bitmap_set_bit (visited,
3493 269813 : SSA_NAME_VERSION
3494 : (USE_FROM_PTR (curr)))));
3495 : }
3496 1426864 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3497 238660 : if (curr == NULL_USE_OPERAND_P) /* Stack exhausted: no path, PATH is left empty. */
3498 : break;
3499 : }
3500 : else
3501 : {
3502 712761 : if (gimple_code (def) == GIMPLE_PHI)
3503 72327 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3504 : else
3505 640434 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3506 : while (curr != NULL_USE_OPERAND_P
3507 850888 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3508 742007 : || ! bitmap_set_bit (visited,
3509 742007 : SSA_NAME_VERSION
3510 : (USE_FROM_PTR (curr)))))
3511 138127 : curr = op_iter_next_use (&curri);
3512 712761 : if (curr == NULL_USE_OPERAND_P)
3513 103744 : goto pop;
3514 : }
3515 : }
3516 : while (1);
3517 101961 : if (dump_file && (dump_flags & TDF_DETAILS))
3518 : {
3519 4111 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3520 4111 : unsigned i;
3521 4111 : std::pair<ssa_op_iter, use_operand_p> *x;
3522 13974 : FOR_EACH_VEC_ELT (path, i, x)
3523 9863 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3524 4111 : dump_printf (MSG_NOTE, "\n");
3525 : }
3526 :
3527 : /* Check whether the reduction path detected is valid. */
3528 101961 : bool fail = path.length () == 0;
3529 101961 : bool neg = false; /* True while the reduction value has been negated an odd number of times. */
3530 101961 : int sign = -1; /* TYPE_SIGN of the operation that set *CODE; -1 until then. */
3531 101961 : *code = ERROR_MARK;
3532 219475 : for (unsigned i = 1; i < path.length (); ++i) /* Entry 0 is the PHI's own latch operand. */
3533 : {
3534 124207 : gimple *use_stmt = USE_STMT (path[i].second);
3535 124207 : gimple_match_op op;
3536 124207 : if (!gimple_extract_op (use_stmt, &op))
3537 : {
3538 : fail = true;
3539 6693 : break;
3540 : }
3541 123304 : unsigned int opi = op.num_ops; /* Index of the path operand in OP; num_ops means not found. */
3542 123304 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3543 : {
3544 : /* The following make sure we can compute the operand index
3545 : easily plus it mostly disallows chaining via COND_EXPR condition
3546 : operands. */
3547 190654 : for (opi = 0; opi < op.num_ops; ++opi)
3548 189641 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3549 : break;
3550 : }
3551 6230 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3552 : {
3553 12485 : for (opi = 0; opi < op.num_ops; ++opi)
3554 12485 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3555 : break;
3556 : }
3557 123304 : if (opi == op.num_ops)
3558 : {
3559 : fail = true;
3560 : break;
3561 : }
3562 122291 : op.code = canonicalize_code (op.code, op.type);
3563 122291 : if (op.code == MINUS_EXPR)
3564 : {
3565 5668 : op.code = PLUS_EXPR; /* Subtraction is treated as a PLUS reduction. */
3566 : /* Track whether we negate the reduction value each iteration. */
3567 5668 : if (op.ops[1] == op.ops[opi])
3568 34 : neg = ! neg;
3569 : }
3570 116623 : else if (op.code == IFN_COND_SUB)
3571 : {
3572 9 : op.code = IFN_COND_ADD; /* Likewise for the conditional form. */
3573 : /* Track whether we negate the reduction value each iteration. */
3574 9 : if (op.ops[2] == op.ops[opi])
3575 0 : neg = ! neg;
3576 : }
3577 : /* For an FMA the reduction code is the PLUS if the addition chain
3578 : is the reduction. */
3579 116614 : else if (op.code == IFN_FMA && opi == 2)
3580 33 : op.code = PLUS_EXPR;
3581 122291 : if (CONVERT_EXPR_CODE_P (op.code)
3582 122291 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3583 : ; /* No-op conversions do not contribute to the reduction code. */
3584 116813 : else if (*code == ERROR_MARK)
3585 : {
3586 99732 : *code = op.code;
3587 99732 : sign = TYPE_SIGN (op.type);
3588 : }
3589 17081 : else if (op.code != *code) /* Every operation on the path must use the same code. */
3590 : {
3591 : fail = true;
3592 : break;
3593 : }
3594 15761 : else if ((op.code == MIN_EXPR
3595 15605 : || op.code == MAX_EXPR)
3596 15776 : && sign != TYPE_SIGN (op.type))
3597 : {
3598 : fail = true;
3599 : break;
3600 : }
3601 : /* Check there's only a single stmt the op is used on. For the
3602 : not value-changing tail and the last stmt allow out-of-loop uses,
3603 : but not when this is the inner loop of a double reduction.
3604 : ??? We could relax this and handle arbitrary live stmts by
3605 : forcing a scalar epilogue for example. */
3606 120968 : imm_use_iterator imm_iter;
3607 120968 : use_operand_p use_p;
3608 120968 : gimple *op_use_stmt;
3609 120968 : unsigned cnt = 0; /* Counted uses of the path operand; must end up as exactly one. */
3610 127163 : bool cond_fn_p = op.code.is_internal_fn ()
3611 6195 : && (conditional_internal_fn_code (internal_fn (op.code))
3612 120968 : != ERROR_MARK);
3613 :
3614 411222 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3615 : {
3616 : /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
3617 : have op1 twice (once as definition, once as else) in the same
3618 : operation. Enforce this. */
3619 169286 : if (cond_fn_p && op_use_stmt == use_stmt)
3620 : {
3621 6129 : gcall *call = as_a<gcall *> (use_stmt);
3622 6129 : unsigned else_pos
3623 6129 : = internal_fn_else_index (internal_fn (op.code));
3624 6129 : if (gimple_call_arg (call, else_pos) != op.ops[opi])
3625 : {
3626 : fail = true;
3627 : break;
3628 : }
3629 30645 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
3630 : {
3631 24516 : if (j == else_pos) /* The else operand is not counted as a use. */
3632 6129 : continue;
3633 18387 : if (gimple_call_arg (call, j) == op.ops[opi])
3634 6129 : cnt++;
3635 : }
3636 : }
3637 163157 : else if (!is_gimple_debug (op_use_stmt)
3638 163157 : && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
3639 2813 : || flow_bb_inside_loop_p (loop,
3640 2813 : gimple_bb (op_use_stmt))))
3641 236733 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3642 118371 : cnt++;
3643 120968 : }
3644 :
3645 120968 : if (cnt != 1)
3646 : {
3647 : fail = true;
3648 : break;
3649 : }
3650 : }
3651 109075 : return ! fail && ! neg && *code != ERROR_MARK; /* Valid path, not negated overall, and an operation was found. */
3652 101961 : }
3653 : /* Overload taking the expected tree code CODE: return true if PHI has a valid reduction path and its reduction operation is CODE. */
3654 : bool
3655 21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3656 : tree loop_arg, enum tree_code code)
3657 : {
3658 21 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; /* Filled by the worker, discarded on return. */
3659 21 : code_helper code_;
3660 21 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false) /* Not treated as the inner loop of a double reduction. */
3661 21 : && code_ == code);
3662 21 : }
3663 :
3664 :
3665 :
3666 : /* Function vect_is_simple_reduction
3667 :
3668 : (1) Detect a cross-iteration def-use cycle that represents a simple
3669 : reduction computation. We look for the following pattern:
3670 :
3671 : loop_header:
3672 : a1 = phi < a0, a2 >
3673 : a3 = ...
3674 : a2 = operation (a3, a1)
3675 :
3676 : or
3677 :
3678 : a3 = ...
3679 : loop_header:
3680 : a1 = phi < a0, a2 >
3681 : a2 = operation (a3, a1)
3682 :
3683 : such that:
3684 : 1. operation is commutative and associative and it is safe to
3685 : change the order of the computation
3686 : 2. no uses for a2 in the loop (a2 is used out of the loop)
3687 : 3. no uses of a1 in the loop besides the reduction operation
3688 : 4. no uses of a1 outside the loop.
3689 :
3690 : Conditions 1,4 are tested here.
3691 : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3692 :
3693 : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3694 : nested cycles.
3695 :
3696 : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3697 : reductions:
3698 :
3699 : a1 = phi < a0, a2 >
3700 : inner loop (def of a3)
3701 : a2 = phi < a3 >
3702 :
3703 : (4) Detect condition expressions, ie:
3704 : for (int i = 0; i < N; i++)
3705 : if (a[i] < val)
3706 : ret_val = a[i];
3707 : Return the stmt_vec_info of the latch definition when a reduction, nested cycle or double reduction is detected, NULL otherwise.
3708 : */
3709 : /* PHI_INFO is the candidate PHI. *DOUBLE_REDUC is set to the inner-loop PHI of a detected double reduction; DOUBLE_REDUC is NULL when testing such an inner loop. */
3710 : static stmt_vec_info
3711 164677 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3712 : gphi **double_reduc)
3713 : {
3714 164677 : gphi *phi = as_a <gphi *> (phi_info->stmt);
3715 164677 : gimple *phi_use_stmt = NULL; /* Last non-debug stmt seen using the PHI result. */
3716 164677 : imm_use_iterator imm_iter;
3717 164677 : use_operand_p use_p;
3718 :
3719 : /* When double_reduc is NULL we are testing the inner loop of a
3720 : double reduction. */
3721 164677 : bool inner_loop_of_double_reduc = double_reduc == NULL;
3722 164677 : if (double_reduc)
3723 163570 : *double_reduc = NULL;
3724 164677 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION; /* Default; COND_EXPR reductions override this below. */
3725 :
3726 164677 : tree phi_name = PHI_RESULT (phi);
3727 : /* ??? If there are no uses of the PHI result the inner loop reduction
3728 : won't be detected as possibly double-reduction by vectorizable_reduction
3729 : because that tries to walk the PHI arg from the preheader edge which
3730 : can be constant. See PR60382. */
3731 164677 : if (has_zero_uses (phi_name))
3732 : return NULL;
3733 164541 : class loop *loop = (gimple_bb (phi))->loop_father;
3734 164541 : unsigned nphi_def_loop_uses = 0; /* In-loop use stmts of the PHI result; consecutive uses by one stmt count once. */
3735 621914 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3736 : {
3737 304439 : gimple *use_stmt = USE_STMT (use_p);
3738 304439 : if (is_gimple_debug (use_stmt))
3739 82616 : continue;
3740 :
3741 221823 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3742 : {
3743 11607 : if (dump_enabled_p ())
3744 35 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3745 : "intermediate value used outside loop.\n");
3746 :
3747 11607 : return NULL;
3748 : }
3749 :
3750 : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
3751 : op1 twice (once as definition, once as else) in the same operation.
3752 : Only count it as one. */
3753 210216 : if (use_stmt != phi_use_stmt)
3754 : {
3755 203679 : nphi_def_loop_uses++;
3756 203679 : phi_use_stmt = use_stmt;
3757 : }
3758 11607 : }
3759 :
3760 152934 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3761 152934 : if (TREE_CODE (latch_def) != SSA_NAME)
3762 : {
3763 1448 : if (dump_enabled_p ())
3764 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3765 : "reduction: not ssa_name: %T\n", latch_def);
3766 1448 : return NULL;
3767 : }
3768 :
3769 151486 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3770 151486 : if (!def_stmt_info
3771 151486 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3772 161 : return NULL;
3773 :
3774 151325 : bool nested_in_vect_loop
3775 151325 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3776 151325 : unsigned nlatch_def_loop_uses = 0; /* In-loop non-debug uses of the latch definition. */
3777 151325 : auto_vec<gphi *, 3> lcphis; /* Out-of-loop uses of the latch definition, required to be PHIs. */
3778 743930 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3779 : {
3780 441280 : gimple *use_stmt = USE_STMT (use_p);
3781 441280 : if (is_gimple_debug (use_stmt))
3782 135828 : continue;
3783 305452 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3784 190578 : nlatch_def_loop_uses++;
3785 : else
3786 : /* We can have more than one loop-closed PHI. */
3787 114874 : lcphis.safe_push (as_a <gphi *> (use_stmt));
3788 151325 : }
3789 :
3790 : /* If we are vectorizing an inner reduction we are executing that
3791 : in the original order only in case we are not dealing with a
3792 : double reduction. */
3793 151325 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3794 : {
3795 2431 : if (dump_enabled_p ())
3796 434 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3797 : "detected nested cycle: ");
3798 2431 : return def_stmt_info;
3799 : }
3800 :
3801 : /* When the inner loop of a double reduction ends up with more than
3802 : one loop-closed PHI we have failed to classify alternate such
3803 : PHIs as double reduction, leading to wrong code. See PR103237. */
3804 149989 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
3805 : {
3806 1 : if (dump_enabled_p ())
3807 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3808 : "unhandle double reduction\n");
3809 1 : return NULL;
3810 : }
3811 :
3812 : /* If this isn't a nested cycle or if the nested cycle reduction value
3813 : is used outside of the inner loop we cannot handle uses of the reduction
3814 : value. */
3815 148893 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3816 : {
3817 45591 : if (dump_enabled_p ())
3818 403 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3819 : "reduction used in loop.\n");
3820 45591 : return NULL;
3821 : }
3822 :
3823 : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3824 : defined in the inner loop. */
3825 103302 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3826 : {
3827 1362 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
3828 1362 : if (gimple_phi_num_args (def_stmt) != 1
3829 1362 : || TREE_CODE (op1) != SSA_NAME)
3830 : {
3831 91 : if (dump_enabled_p ())
3832 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3833 : "unsupported phi node definition.\n");
3834 :
3835 91 : return NULL;
3836 : }
3837 :
3838 : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3839 : and the latch definition op1. */
3840 1271 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
3841 1271 : if (gimple_bb (def1)
3842 1271 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3843 1271 : && loop->inner
3844 1217 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3845 1217 : && (is_gimple_assign (def1) || is_gimple_call (def1))
3846 1208 : && is_a <gphi *> (phi_use_stmt)
3847 1196 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
3848 1196 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
3849 : loop_latch_edge (loop->inner)))
3850 2465 : && lcphis.length () == 1)
3851 : {
3852 1107 : if (dump_enabled_p ())
3853 144 : report_vect_op (MSG_NOTE, def_stmt,
3854 : "detected double reduction: ");
3855 :
3856 1107 : *double_reduc = as_a <gphi *> (phi_use_stmt); /* The PHI in loop->inner that uses the outer PHI result. */
3857 1107 : return def_stmt_info;
3858 : }
3859 :
3860 164 : return NULL;
3861 : }
3862 :
3863 : /* Look for the expression computing latch_def from the loop PHI result. */
3864 101940 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3865 101940 : code_helper code;
3866 101940 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3867 : path, inner_loop_of_double_reduc))
3868 : {
3869 94826 : STMT_VINFO_REDUC_CODE (phi_info) = code;
3870 94826 : if (code == COND_EXPR && !nested_in_vect_loop)
3871 8251 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3872 :
3873 : /* Fill in STMT_VINFO_REDUC_IDX. */
3874 94826 : unsigned i; /* Path entry 0 is the PHI's own latch operand and is not visited. */
3875 305505 : for (i = path.length () - 1; i >= 1; --i)
3876 : {
3877 115853 : gimple *stmt = USE_STMT (path[i].second);
3878 115853 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3879 115853 : gimple_match_op op;
3880 115853 : if (!gimple_extract_op (stmt, &op))
3881 0 : gcc_unreachable ();
3882 115853 : if (gassign *assign = dyn_cast<gassign *> (stmt))
3883 109643 : STMT_VINFO_REDUC_IDX (stmt_info)
3884 109643 : = path[i].second->use - gimple_assign_rhs1_ptr (assign); /* Operand index relative to rhs1. */
3885 : else
3886 : {
3887 6210 : gcall *call = as_a<gcall *> (stmt);
3888 6210 : STMT_VINFO_REDUC_IDX (stmt_info)
3889 6210 : = path[i].second->use - gimple_call_arg_ptr (call, 0); /* Operand index relative to argument 0. */
3890 : }
3891 : }
3892 94826 : if (dump_enabled_p ())
3893 4104 : dump_printf_loc (MSG_NOTE, vect_location,
3894 : "reduction: detected reduction\n");
3895 :
3896 94826 : return def_stmt_info;
3897 : }
3898 :
3899 7114 : if (dump_enabled_p ())
3900 89 : dump_printf_loc (MSG_NOTE, vect_location,
3901 : "reduction: unknown pattern\n");
3902 :
3903 : return NULL;
3904 253265 : }
3905 :
3906 : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3907 : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3908 : or -1 if not known. Half of the costing VF is assumed when either count is unknown. */
3909 :
3910 : static int
3911 481820 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3912 : {
3913 481820 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3914 481820 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3915 : {
3916 204120 : if (dump_enabled_p ())
3917 3587 : dump_printf_loc (MSG_NOTE, vect_location,
3918 : "cost model: epilogue peel iters set to vf/2 "
3919 : "because loop iterations are unknown .\n");
3920 204120 : return assumed_vf / 2;
3921 : }
3922 : else
3923 : {
3924 277700 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3925 277700 : peel_iters_prologue = MIN (niters, peel_iters_prologue); /* The prologue cannot peel more than NITERS iterations. */
3926 277700 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf; /* Iterations left over after whole vector iterations. */
3927 : /* If we need to peel for gaps, but no peeling is required, we have to
3928 : peel VF iterations. */
3929 277700 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3930 277700 : peel_iters_epilogue = assumed_vf;
3931 277700 : return peel_iters_epilogue;
3932 : }
3933 : }
3934 :
3935 : /* Calculate cost of peeling the scalar loop PEEL_ITERS_PROLOGUE times for
3936 : a prologue and the corresponding times for the epilogue. The epilogue count comes from vect_get_peel_iters_epilogue. */
3937 : int
3938 357440 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue)
3939 : {
3940 357440 : int retval = 0;
3941 :
3942 357440 : int peel_iters_epilogue
3943 357440 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3944 :
3945 357440 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3946 : {
3947 : /* If peeled iterations are known but number of scalar loop
3948 : iterations are unknown, count a taken branch per peeled loop. */
3949 138259 : if (peel_iters_prologue > 0)
3950 84384 : retval = builtin_vectorization_cost (cond_branch_taken, NULL_TREE, 0);
3951 138259 : if (peel_iters_epilogue > 0)
3952 138151 : retval += builtin_vectorization_cost (cond_branch_taken, NULL_TREE, 0);
3953 : }
3954 : /* Each peeled iteration costs one scalar body; each non-empty peeled loop adds the scalar outside cost once. */
3955 714880 : retval += ((peel_iters_prologue + peel_iters_epilogue)
3956 357440 : * loop_vinfo->scalar_costs->body_cost ());
3957 714880 : retval += (((peel_iters_prologue != 0) + (peel_iters_epilogue != 0))
3958 357440 : * loop_vinfo->scalar_costs->outside_cost ());
3959 :
3960 357440 : return retval;
3961 : }
3962 :
3963 : /* Function vect_estimate_min_profitable_iters
3964 :
3965 : Return the number of iterations required for the vector version of the
3966 : loop to be profitable relative to the cost of the scalar version of the
3967 : loop.
3968 :
3969 : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3970 : of iterations for vectorization. -1 value means loop vectorization
3971 : is not profitable. This returned value may be used for dynamic
3972 : profitability check.
3973 :
3974 : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3975 : for static check against estimated number of iterations. */
3976 :
3977 : static void
3978 141664 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3979 : int *ret_min_profitable_niters,
3980 : int *ret_min_profitable_estimate,
3981 : unsigned *suggested_unroll_factor)
3982 : {
3983 141664 : int min_profitable_iters;
3984 141664 : int min_profitable_estimate;
3985 141664 : int peel_iters_prologue;
3986 141664 : int peel_iters_epilogue;
3987 141664 : unsigned vec_inside_cost = 0;
3988 141664 : int vec_outside_cost = 0;
3989 141664 : unsigned vec_prologue_cost = 0;
3990 141664 : unsigned vec_epilogue_cost = 0;
3991 141664 : int scalar_single_iter_cost = 0;
3992 141664 : int scalar_outside_cost = 0;
3993 141664 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3994 141664 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3995 141664 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
3996 :
3997 : /* Cost model disabled. */
3998 141664 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3999 : {
4000 16979 : if (dump_enabled_p ())
4001 10654 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4002 16979 : *ret_min_profitable_niters = 0;
4003 16979 : *ret_min_profitable_estimate = 0;
4004 16979 : return;
4005 : }
4006 :
4007 : /* Requires loop versioning tests to handle misalignment. */
4008 124685 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4009 : {
4010 : /* FIXME: Make cost depend on complexity of individual check. */
4011 18 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4012 18 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4013 18 : if (dump_enabled_p ())
4014 2 : dump_printf (MSG_NOTE,
4015 : "cost model: Adding cost of checks for loop "
4016 : "versioning to treat misalignment.\n");
4017 : }
4018 :
4019 : /* Requires loop versioning with alias checks. */
4020 124685 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4021 : {
4022 : /* FIXME: Make cost depend on complexity of individual check. */
4023 7114 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4024 7114 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4025 7114 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4026 4 : if (len)
4027 : /* Count LEN - 1 ANDs and LEN comparisons. */
4028 4 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4029 : scalar_stmt, vect_prologue);
4030 7114 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4031 1272 : if (len)
4032 : {
4033 : /* Count LEN - 1 ANDs and LEN comparisons. */
4034 1272 : unsigned int nstmts = len * 2 - 1;
4035 : /* +1 for each bias that needs adding. */
4036 2544 : for (unsigned int i = 0; i < len; ++i)
4037 1272 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4038 151 : nstmts += 1;
4039 1272 : (void) add_stmt_cost (target_cost_data, nstmts,
4040 : scalar_stmt, vect_prologue);
4041 : }
4042 7114 : if (dump_enabled_p ())
4043 32 : dump_printf (MSG_NOTE,
4044 : "cost model: Adding cost of checks for loop "
4045 : "versioning aliasing.\n");
4046 : }
4047 :
4048 : /* Requires loop versioning with niter checks. */
4049 124685 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4050 : {
4051 : /* FIXME: Make cost depend on complexity of individual check. */
4052 751 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4053 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4054 751 : if (dump_enabled_p ())
4055 1 : dump_printf (MSG_NOTE,
4056 : "cost model: Adding cost of checks for loop "
4057 : "versioning niters.\n");
4058 : }
4059 :
4060 124685 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4061 7877 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4062 : vect_prologue);
4063 :
4064 : /* Count statements in scalar loop. Using this as scalar cost for a single
4065 : iteration for now.
4066 :
4067 : TODO: Add outer loop support.
4068 :
4069 : TODO: Consider assigning different costs to different scalar
4070 : statements. */
4071 :
4072 124685 : scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4073 :
4074 : /* Add additional cost for the peeled instructions in prologue and epilogue
4075 : loop. (For fully-masked loops there will be no peeling.)
4076 :
4077 : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4078 : at compile-time - we assume it's vf/2 (the worst would be vf-1).
4079 :
4080 : TODO: Build an expression that represents peel_iters for prologue and
4081 : epilogue to be used in a run-time test. */
4082 :
4083 124685 : bool prologue_need_br_taken_cost = false;
4084 124685 : bool prologue_need_br_not_taken_cost = false;
4085 :
4086 : /* Calculate peel_iters_prologue. */
4087 124685 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4088 : peel_iters_prologue = 0;
4089 124685 : else if (npeel < 0)
4090 : {
4091 279 : peel_iters_prologue = assumed_vf / 2;
4092 279 : if (dump_enabled_p ())
4093 8 : dump_printf (MSG_NOTE, "cost model: "
4094 : "prologue peel iters set to vf/2.\n");
4095 :
4096 : /* If peeled iterations are unknown, count a taken branch and a not taken
4097 : branch per peeled loop. Even if scalar loop iterations are known,
4098 : vector iterations are not known since peeled prologue iterations are
4099 : not known. Hence guards remain the same. */
4100 : prologue_need_br_taken_cost = true;
4101 : prologue_need_br_not_taken_cost = true;
4102 : }
4103 : else
4104 : {
4105 124406 : peel_iters_prologue = npeel;
4106 124406 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4107 : /* If peeled iterations are known but number of scalar loop
4108 : iterations are unknown, count a taken branch per peeled loop. */
4109 124685 : prologue_need_br_taken_cost = true;
4110 : }
4111 :
4112 124685 : bool epilogue_need_br_taken_cost = false;
4113 124685 : bool epilogue_need_br_not_taken_cost = false;
4114 :
4115 : /* Calculate peel_iters_epilogue. */
4116 124685 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4117 : /* We need to peel exactly one iteration for gaps. */
4118 26 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4119 124659 : else if (npeel < 0)
4120 : {
4121 : /* If peeling for alignment is unknown, loop bound of main loop
4122 : becomes unknown. */
4123 279 : peel_iters_epilogue = assumed_vf / 2;
4124 279 : if (dump_enabled_p ())
4125 8 : dump_printf (MSG_NOTE, "cost model: "
4126 : "epilogue peel iters set to vf/2 because "
4127 : "peeling for alignment is unknown.\n");
4128 :
4129 : /* See the same reason above in peel_iters_prologue calculation. */
4130 : epilogue_need_br_taken_cost = true;
4131 : epilogue_need_br_not_taken_cost = true;
4132 : }
4133 : else
4134 : {
4135 124380 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4136 124380 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4137 : /* If peeled iterations are known but number of scalar loop
4138 : iterations are unknown, count a taken branch per peeled loop. */
4139 124685 : epilogue_need_br_taken_cost = true;
4140 : }
4141 :
4142 : /* The way we accumulate peeling costs into the vector prologue/epilogue
4143 : cost is a bit awkward given we cannot reuse scalar_costs which is
4144 : already computed and also because it cannot take into account any
4145 : epilogue vectorization we'll carry out in the end. */
4146 :
4147 124685 : stmt_info_for_cost *si;
4148 124685 : int j;
4149 : /* Add costs associated with peel_iters_prologue. */
4150 124685 : if (peel_iters_prologue)
4151 1068 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4152 : {
4153 775 : (void) add_stmt_cost (target_cost_data,
4154 775 : si->count * peel_iters_prologue, si->kind,
4155 : si->stmt_info, si->node, si->vectype,
4156 : si->misalign, vect_prologue);
4157 : }
4158 :
4159 : /* Add costs associated with peel_iters_epilogue. */
4160 124685 : if (peel_iters_epilogue)
4161 387630 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4162 : {
4163 310568 : (void) add_stmt_cost (target_cost_data,
4164 310568 : si->count * peel_iters_epilogue, si->kind,
4165 : si->stmt_info, si->node, si->vectype,
4166 : si->misalign, vect_epilogue);
4167 : }
4168 :
4169 : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4170 :
4171 124685 : if (prologue_need_br_taken_cost)
4172 279 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4173 : vect_prologue);
4174 :
4175 124685 : if (prologue_need_br_not_taken_cost)
4176 279 : (void) add_stmt_cost (target_cost_data, 1,
4177 : cond_branch_not_taken, vect_prologue);
4178 :
4179 124685 : if (epilogue_need_br_taken_cost)
4180 65274 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4181 : vect_epilogue);
4182 :
4183 124685 : if (epilogue_need_br_not_taken_cost)
4184 279 : (void) add_stmt_cost (target_cost_data, 1,
4185 : cond_branch_not_taken, vect_epilogue);
4186 :
4187 : /* Take care of special costs for rgroup controls of partial vectors. */
4188 26 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4189 124711 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4190 : == vect_partial_vectors_avx512))
4191 : {
4192 : /* Calculate how many masks we need to generate. */
4193 26 : unsigned int num_masks = 0;
4194 26 : bool need_saturation = false;
4195 108 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4196 30 : if (rgm.type)
4197 : {
4198 26 : unsigned nvectors = rgm.factor;
4199 26 : num_masks += nvectors;
4200 26 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4201 26 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4202 9 : need_saturation = true;
4203 : }
4204 :
4205 : /* ??? The target isn't able to identify the costs below as
4206 : producing masks so it cannot penalize cases where we'd run
4207 : out of mask registers for example. */
4208 :
4209 : /* ??? We are also failing to account for smaller vector masks
4210 : we generate by splitting larger masks in vect_get_loop_mask. */
4211 :
4212 : /* In the worst case, we need to generate each mask in the prologue
4213 : and in the loop body. We need one splat per group and one
4214 : compare per mask.
4215 :
4216 : Sometimes the prologue mask will fold to a constant,
4217 : so the actual prologue cost might be smaller. However, it's
4218 : simpler and safer to use the worst-case cost; if this ends up
4219 : being the tie-breaker between vectorizing or not, then it's
4220 : probably better not to vectorize. */
4221 26 : (void) add_stmt_cost (target_cost_data,
4222 : num_masks
4223 26 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4224 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4225 : vect_prologue);
4226 52 : (void) add_stmt_cost (target_cost_data,
4227 : num_masks
4228 52 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4229 : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4230 :
4231 : /* When we need saturation we need it both in the prologue and
4232 : the epilogue. */
4233 26 : if (need_saturation)
4234 : {
4235 9 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4236 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4237 9 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4238 : NULL, NULL, NULL_TREE, 0, vect_body);
4239 : }
4240 : }
4241 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4242 124659 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4243 : == vect_partial_vectors_while_ult))
4244 : {
4245 : /* Calculate how many masks we need to generate. */
4246 : unsigned int num_masks = 0;
4247 : rgroup_controls *rgm;
4248 : unsigned int num_vectors_m1;
4249 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4250 : num_vectors_m1, rgm)
4251 0 : if (rgm->type)
4252 0 : num_masks += num_vectors_m1 + 1;
4253 0 : gcc_assert (num_masks > 0);
4254 :
4255 : /* In the worst case, we need to generate each mask in the prologue
4256 : and in the loop body. One of the loop body mask instructions
4257 : replaces the comparison in the scalar loop, and since we don't
4258 : count the scalar comparison against the scalar body, we shouldn't
4259 : count that vector instruction against the vector body either.
4260 :
4261 : Sometimes we can use unpacks instead of generating prologue
4262 : masks and sometimes the prologue mask will fold to a constant,
4263 : so the actual prologue cost might be smaller. However, it's
4264 : simpler and safer to use the worst-case cost; if this ends up
4265 : being the tie-breaker between vectorizing or not, then it's
4266 : probably better not to vectorize. */
4267 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4268 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4269 : vect_prologue);
4270 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4271 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4272 : vect_body);
4273 : }
4274 124659 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4275 : {
4276 : /* Referring to the functions vect_set_loop_condition_partial_vectors
4277 : and vect_set_loop_controls_directly, we need to generate each
4278 : length in the prologue and in the loop body if required. Although
4279 : there are some possible optimizations, we consider the worst case
4280 : here. */
4281 :
4282 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4283 0 : signed char partial_load_store_bias
4284 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4285 0 : bool need_iterate_p
4286 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4287 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4288 :
4289 : /* Calculate how many statements to be added. */
4290 0 : unsigned int prologue_stmts = 0;
4291 0 : unsigned int body_stmts = 0;
4292 :
4293 0 : rgroup_controls *rgc;
4294 0 : unsigned int num_vectors_m1;
4295 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4296 0 : if (rgc->type)
4297 : {
4298 : /* May need one SHIFT for nitems_total computation. */
4299 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4300 0 : if (nitems != 1 && !niters_known_p)
4301 0 : prologue_stmts += 1;
4302 :
4303 : /* May need one MAX and one MINUS for wrap around. */
4304 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4305 0 : prologue_stmts += 2;
4306 :
4307 : /* Need one MAX and one MINUS for each batch limit excepting for
4308 : the 1st one. */
4309 0 : prologue_stmts += num_vectors_m1 * 2;
4310 :
4311 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4312 :
4313 : /* Need to set up lengths in prologue, only one MIN required
4314 : for each since start index is zero. */
4315 0 : prologue_stmts += num_vectors;
4316 :
4317 : /* If we have a non-zero partial load bias, we need one PLUS
4318 : to adjust the load length. */
4319 0 : if (partial_load_store_bias != 0)
4320 0 : body_stmts += 1;
4321 :
4322 0 : unsigned int length_update_cost = 0;
4323 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4324 : /* For decrement IV style, Each only need a single SELECT_VL
4325 : or MIN since beginning to calculate the number of elements
4326 : need to be processed in current iteration. */
4327 : length_update_cost = 1;
4328 : else
4329 : /* For increment IV style, each may need two MINs and one MINUS to
4330 : update lengths in body for next iteration. */
4331 0 : length_update_cost = 3;
4332 :
4333 0 : if (need_iterate_p)
4334 0 : body_stmts += length_update_cost * num_vectors;
4335 : }
4336 :
4337 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4338 : scalar_stmt, vect_prologue);
4339 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4340 : scalar_stmt, vect_body);
4341 : }
4342 :
4343 : /* FORNOW: The scalar outside cost is incremented in one of the
4344 : following ways:
4345 :
4346 : 1. The vectorizer checks for alignment and aliasing and generates
4347 : a condition that allows dynamic vectorization. A cost model
4348 : check is ANDED with the versioning condition. Hence scalar code
4349 : path now has the added cost of the versioning check.
4350 :
4351 : if (cost > th & versioning_check)
4352 : jmp to vector code
4353 :
4354 : Hence run-time scalar is incremented by not-taken branch cost.
4355 :
4356 : 2. The vectorizer then checks if a prologue is required. If the
4357 : cost model check was not done before during versioning, it has to
4358 : be done before the prologue check.
4359 :
4360 : if (cost <= th)
4361 : prologue = scalar_iters
4362 : if (prologue == 0)
4363 : jmp to vector code
4364 : else
4365 : execute prologue
4366 : if (prologue == num_iters)
4367 : go to exit
4368 :
4369 : Hence the run-time scalar cost is incremented by a taken branch,
4370 : plus a not-taken branch, plus a taken branch cost.
4371 :
4372 : 3. The vectorizer then checks if an epilogue is required. If the
4373 : cost model check was not done before during prologue check, it
4374 : has to be done with the epilogue check.
4375 :
4376 : if (prologue == 0)
4377 : jmp to vector code
4378 : else
4379 : execute prologue
4380 : if (prologue == num_iters)
4381 : go to exit
4382 : vector code:
4383 : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4384 : jmp to epilogue
4385 :
4386 : Hence the run-time scalar cost should be incremented by 2 taken
4387 : branches.
4388 :
4389 : TODO: The back end may reorder the BBS's differently and reverse
4390 : conditions/branch directions. Change the estimates below to
4391 : something more reasonable. */
4392 :
4393 : /* If the number of iterations is known and we do not do versioning, we can
4394 : decide whether to vectorize at compile time. Hence the scalar version
4395 : do not carry cost model guard costs. */
4396 58567 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4397 183252 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4398 : {
4399 : /* Cost model check occurs at versioning. */
4400 67209 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4401 7877 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4402 : else
4403 : {
4404 : /* Cost model check occurs at prologue generation. */
4405 59332 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4406 152 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4407 152 : + vect_get_stmt_cost (cond_branch_not_taken);
4408 : /* Cost model check occurs at epilogue generation. */
4409 : else
4410 59180 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4411 : }
4412 : }
4413 :
4414 : /* Complete the target-specific cost calculations. */
4415 124685 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
4416 124685 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
4417 124685 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
4418 124685 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
4419 124685 : if (suggested_unroll_factor)
4420 124298 : *suggested_unroll_factor
4421 124298 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
4422 :
4423 124298 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
4424 416 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4425 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4426 : *suggested_unroll_factor,
4427 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4428 : {
4429 0 : if (dump_enabled_p ())
4430 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4431 : "can't unroll as unrolled vectorization factor larger"
4432 : " than maximum vectorization factor: "
4433 : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4434 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4435 0 : *suggested_unroll_factor = 1;
4436 : }
4437 :
4438 124685 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4439 :
4440 124685 : if (dump_enabled_p ())
4441 : {
4442 1087 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4443 1087 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4444 : vec_inside_cost);
4445 1087 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4446 : vec_prologue_cost);
4447 1087 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4448 : vec_epilogue_cost);
4449 1087 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4450 : scalar_single_iter_cost);
4451 1087 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4452 : scalar_outside_cost);
4453 1087 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4454 : vec_outside_cost);
4455 1087 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4456 : peel_iters_prologue);
4457 1087 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4458 : peel_iters_epilogue);
4459 : }
4460 :
4461 : /* Calculate number of iterations required to make the vector version
4462 : profitable, relative to the loop bodies only. The following condition
4463 : must hold true:
4464 : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4465 : where
4466 : SIC = scalar iteration cost, VIC = vector iteration cost,
4467 : VOC = vector outside cost, VF = vectorization factor,
4468 : NPEEL = prologue iterations + epilogue iterations,
4469 : SOC = scalar outside cost for run time cost model check. */
4470 :
4471 124685 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4472 124685 : - vec_inside_cost);
4473 124685 : if (saving_per_viter <= 0)
4474 : {
4475 23820 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4476 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4477 : "vectorization did not happen for a simd loop");
4478 :
4479 23820 : if (dump_enabled_p ())
4480 30 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4481 : "cost model: the vector iteration cost = %d "
4482 : "divided by the scalar iteration cost = %d "
4483 : "is greater or equal to the vectorization factor = %d"
4484 : ".\n",
4485 : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4486 23820 : *ret_min_profitable_niters = -1;
4487 23820 : *ret_min_profitable_estimate = -1;
4488 23820 : return;
4489 : }
4490 :
4491 : /* ??? The "if" arm is written to handle all cases; see below for what
4492 : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4493 100865 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4494 : {
4495 : /* Rewriting the condition above in terms of the number of
4496 : vector iterations (vniters) rather than the number of
4497 : scalar iterations (niters) gives:
4498 :
4499 : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4500 :
4501 : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4502 :
4503 : For integer N, X and Y when X > 0:
4504 :
4505 : N * X > Y <==> N >= (Y /[floor] X) + 1. */
4506 18 : int outside_overhead = (vec_outside_cost
4507 18 : - scalar_single_iter_cost * peel_iters_prologue
4508 18 : - scalar_single_iter_cost * peel_iters_epilogue
4509 : - scalar_outside_cost);
4510 : /* We're only interested in cases that require at least one
4511 : vector iteration. */
4512 18 : int min_vec_niters = 1;
4513 18 : if (outside_overhead > 0)
4514 13 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4515 :
4516 18 : if (dump_enabled_p ())
4517 7 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4518 : min_vec_niters);
4519 :
4520 18 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4521 : {
4522 : /* Now that we know the minimum number of vector iterations,
4523 : find the minimum niters for which the scalar cost is larger:
4524 :
4525 : SIC * niters > VIC * vniters + VOC - SOC
4526 :
4527 : We know that the minimum niters is no more than
4528 : vniters * VF + NPEEL, but it might be (and often is) less
4529 : than that if a partial vector iteration is cheaper than the
4530 : equivalent scalar code. */
4531 18 : int threshold = (vec_inside_cost * min_vec_niters
4532 18 : + vec_outside_cost
4533 18 : - scalar_outside_cost);
4534 18 : if (threshold <= 0)
4535 : min_profitable_iters = 1;
4536 : else
4537 18 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4538 : }
4539 : else
4540 : /* Convert the number of vector iterations into a number of
4541 : scalar iterations. */
4542 0 : min_profitable_iters = (min_vec_niters * assumed_vf
4543 0 : + peel_iters_prologue
4544 : + peel_iters_epilogue);
4545 : }
4546 : else
4547 : {
4548 100847 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4549 100847 : * assumed_vf
4550 100847 : - vec_inside_cost * peel_iters_prologue
4551 100847 : - vec_inside_cost * peel_iters_epilogue);
4552 100847 : if (min_profitable_iters <= 0)
4553 : min_profitable_iters = 0;
4554 : else
4555 : {
4556 85838 : min_profitable_iters /= saving_per_viter;
4557 :
4558 85838 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4559 85838 : <= (((int) vec_inside_cost * min_profitable_iters)
4560 85838 : + (((int) vec_outside_cost - scalar_outside_cost)
4561 : * assumed_vf)))
4562 85838 : min_profitable_iters++;
4563 : }
4564 : }
4565 :
4566 100865 : if (dump_enabled_p ())
4567 1057 : dump_printf (MSG_NOTE,
4568 : " Calculated minimum iters for profitability: %d\n",
4569 : min_profitable_iters);
4570 :
4571 100865 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4572 100847 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4573 : /* We want the vectorized loop to execute at least once. */
4574 : min_profitable_iters = assumed_vf + peel_iters_prologue;
4575 22061 : else if (min_profitable_iters < peel_iters_prologue)
4576 : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4577 : vectorized loop executes at least once. */
4578 : min_profitable_iters = peel_iters_prologue;
4579 :
4580 100865 : if (dump_enabled_p ())
4581 1057 : dump_printf_loc (MSG_NOTE, vect_location,
4582 : " Runtime profitability threshold = %d\n",
4583 : min_profitable_iters);
4584 :
4585 100865 : *ret_min_profitable_niters = min_profitable_iters;
4586 :
4587 : /* Calculate number of iterations required to make the vector version
4588 : profitable, relative to the loop bodies only.
4589 :
4590 : Non-vectorized variant is SIC * niters and it must win over vector
4591 : variant on the expected loop trip count. The following condition must hold true:
4592 : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4593 :
4594 100865 : if (vec_outside_cost <= 0)
4595 : min_profitable_estimate = 0;
4596 : /* ??? This "else if" arm is written to handle all cases; see below for
4597 : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4598 90305 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4599 : {
4600 : /* This is a repeat of the code above, but with + SOC rather
4601 : than - SOC. */
4602 18 : int outside_overhead = (vec_outside_cost
4603 18 : - scalar_single_iter_cost * peel_iters_prologue
4604 18 : - scalar_single_iter_cost * peel_iters_epilogue
4605 : + scalar_outside_cost);
4606 18 : int min_vec_niters = 1;
4607 18 : if (outside_overhead > 0)
4608 18 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4609 :
4610 18 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4611 : {
4612 18 : int threshold = (vec_inside_cost * min_vec_niters
4613 18 : + vec_outside_cost
4614 18 : + scalar_outside_cost);
4615 18 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4616 : }
4617 : else
4618 : min_profitable_estimate = (min_vec_niters * assumed_vf
4619 : + peel_iters_prologue
4620 : + peel_iters_epilogue);
4621 : }
4622 : else
4623 : {
4624 90287 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4625 90287 : * assumed_vf
4626 90287 : - vec_inside_cost * peel_iters_prologue
4627 90287 : - vec_inside_cost * peel_iters_epilogue)
4628 90287 : / ((scalar_single_iter_cost * assumed_vf)
4629 : - vec_inside_cost);
4630 : }
4631 100865 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4632 100865 : if (dump_enabled_p ())
4633 1057 : dump_printf_loc (MSG_NOTE, vect_location,
4634 : " Static estimate profitability threshold = %d\n",
4635 : min_profitable_estimate);
4636 :
4637 100865 : *ret_min_profitable_estimate = min_profitable_estimate;
4638 : }
4639 :
4640 : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4641 : vector elements (not bits) for a vector with NELT elements. */
4642 : static void
4643 2293 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4644 : vec_perm_builder *sel)
4645 : {
4646 : /* The encoding is a single stepped pattern. Any wrap-around is handled
4647 : by vec_perm_indices. */
4648 2293 : sel->new_vector (nelt, 1, 3);
4649 9172 : for (unsigned int i = 0; i < 3; i++)
4650 6879 : sel->quick_push (i + offset);
4651 2293 : }
4652 :
4653 : /* Checks whether the target supports whole-vector shifts for vectors of mode
4654 : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4655 : it supports vec_perm_const with masks for all necessary shift amounts. */
4656 : static bool
4657 13726 : have_whole_vector_shift (machine_mode mode)
4658 : {
4659 13726 : if (can_implement_p (vec_shr_optab, mode))
4660 : return true;
4661 :
4662 : /* Variable-length vectors should be handled via the optab. */
4663 63 : unsigned int nelt;
4664 126 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4665 : return false;
4666 :
4667 63 : vec_perm_builder sel;
4668 63 : vec_perm_indices indices;
4669 315 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4670 : {
4671 252 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
4672 252 : indices.new_vector (sel, 2, nelt);
4673 252 : if (!can_vec_perm_const_p (mode, mode, indices, false))
4674 : return false;
4675 : }
4676 : return true;
4677 63 : }
4678 :
4679 : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4680 : multiplication operands have differing signs and (b) we intend
4681 : to emulate the operation using a series of signed DOT_PROD_EXPRs.
4682 : See vect_emulate_mixed_dot_prod for the actual sequence used. */
4683 :
4684 : static bool
4685 2457 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
4686 : {
4687 2457 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
4688 2457 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4689 2004 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4690 : return false;
4691 :
4692 825 : tree rhs1 = gimple_assign_rhs1 (assign);
4693 825 : tree rhs2 = gimple_assign_rhs2 (assign);
4694 825 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4695 : return false;
4696 :
4697 627 : return !directly_supported_p (DOT_PROD_EXPR,
4698 : SLP_TREE_VECTYPE (slp_node),
4699 209 : SLP_TREE_VECTYPE
4700 : (SLP_TREE_CHILDREN (slp_node)[0]),
4701 209 : optab_vector_mixed_sign);
4702 : }
4703 :
4704 : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4705 : functions. Design better to avoid maintenance issues. */
4706 :
4707 : /* Function vect_model_reduction_cost.
4708 :
4709 : Models cost for a reduction operation, including the vector ops
4710 : generated within the strip-mine loop in some cases, the initial
4711 : definition before the loop, and the epilogue code that must be generated. */
4712 :
4713 : static void
4714 72216 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
4715 : slp_tree node, internal_fn reduc_fn,
4716 : vect_reduction_type reduction_type,
4717 : int ncopies, stmt_vector_for_cost *cost_vec)
4718 : {
4719 72216 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4720 72216 : tree vectype;
4721 72216 : machine_mode mode;
4722 72216 : class loop *loop = NULL;
4723 :
4724 72216 : if (loop_vinfo)
4725 72216 : loop = LOOP_VINFO_LOOP (loop_vinfo);
4726 :
4727 : /* Condition reductions generate two reductions in the loop. */
4728 72216 : if (reduction_type == COND_REDUCTION)
4729 324 : ncopies *= 2;
4730 :
4731 72216 : vectype = SLP_TREE_VECTYPE (node);
4732 72216 : mode = TYPE_MODE (vectype);
4733 72216 : stmt_vec_info orig_stmt_info
4734 72216 : = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
4735 :
4736 72216 : gimple_match_op op;
4737 72216 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4738 0 : gcc_unreachable ();
4739 :
4740 72216 : if (reduction_type == EXTRACT_LAST_REDUCTION)
4741 : /* No extra instructions are needed in the prologue. The loop body
4742 : operations are costed in vectorizable_condition. */
4743 : inside_cost = 0;
4744 72216 : else if (reduction_type == FOLD_LEFT_REDUCTION)
4745 : {
4746 : /* No extra instructions needed in the prologue. */
4747 4281 : prologue_cost = 0;
4748 :
4749 4281 : if (reduc_fn != IFN_LAST)
4750 : /* Count one reduction-like operation per vector. */
4751 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4752 : node, 0, vect_body);
4753 : else
4754 : {
4755 : /* Use NCOPIES deconstructs and NELEMENTS scalar ops. */
4756 4281 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4757 4281 : inside_cost = record_stmt_cost (cost_vec, ncopies,
4758 : vec_deconstruct, node, 0,
4759 : vect_body);
4760 4281 : inside_cost += record_stmt_cost (cost_vec, nelements,
4761 : scalar_stmt, node, 0,
4762 : vect_body);
4763 : }
4764 : }
4765 : else
4766 : {
4767 : /* Add in the cost of the initial definitions. */
4768 67935 : int prologue_stmts;
4769 67935 : if (reduction_type == COND_REDUCTION)
4770 : /* For cond reductions we have four vectors: initial index, step,
4771 : initial result of the data reduction, initial value of the index
4772 : reduction. */
4773 : prologue_stmts = 4;
4774 : else
4775 : /* We need the initial reduction value. */
4776 67611 : prologue_stmts = 1;
4777 67935 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4778 : scalar_to_vec, node, 0,
4779 : vect_prologue);
4780 : }
4781 :
4782 : /* Determine cost of epilogue code.
4783 :
4784 : We have a reduction operator that will reduce the vector in one statement.
4785 : Also requires scalar extract. */
4786 :
4787 72216 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4788 : {
4789 72032 : if (reduc_fn != IFN_LAST)
4790 : {
4791 52443 : if (reduction_type == COND_REDUCTION)
4792 : {
4793 : /* An EQ stmt and an COND_EXPR stmt. */
4794 18 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4795 : vector_stmt, node, 0,
4796 : vect_epilogue);
4797 : /* Reduction of the max index and a reduction of the found
4798 : values. */
4799 18 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4800 : vec_to_scalar, node, 0,
4801 : vect_epilogue);
4802 : /* A broadcast of the max value. */
4803 18 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4804 : scalar_to_vec, node, 0,
4805 : vect_epilogue);
4806 : }
4807 : else
4808 : {
4809 52425 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4810 : node, 0, vect_epilogue);
4811 52425 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4812 : vec_to_scalar, node, 0,
4813 : vect_epilogue);
4814 : }
4815 : }
4816 19589 : else if (reduction_type == COND_REDUCTION)
4817 : {
4818 306 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4819 : /* Extraction of scalar elements. */
4820 306 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4821 : vec_deconstruct, node, 0,
4822 : vect_epilogue);
4823 : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4824 306 : epilogue_cost += record_stmt_cost (cost_vec,
4825 306 : 2 * estimated_nunits - 3,
4826 : scalar_stmt, node, 0,
4827 : vect_epilogue);
4828 : }
4829 19283 : else if (reduction_type == EXTRACT_LAST_REDUCTION
4830 19283 : || reduction_type == FOLD_LEFT_REDUCTION)
4831 : /* No extra instructions are needed in the epilogue. */
4832 : ;
4833 : else
4834 : {
4835 15002 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4836 15002 : tree bitsize = TYPE_SIZE (op.type);
4837 15002 : int element_bitsize = tree_to_uhwi (bitsize);
4838 15002 : int nelements = vec_size_in_bits / element_bitsize;
4839 :
4840 15002 : if (op.code == COND_EXPR)
4841 31 : op.code = MAX_EXPR;
4842 :
4843 : /* We have a whole vector shift available. */
4844 3141 : if (VECTOR_MODE_P (mode)
4845 15002 : && directly_supported_p (op.code, vectype)
4846 26759 : && have_whole_vector_shift (mode))
4847 : {
4848 : /* Final reduction via vector shifts and the reduction operator.
4849 : Also requires scalar extract. */
4850 35271 : epilogue_cost += record_stmt_cost (cost_vec,
4851 23514 : exact_log2 (nelements) * 2,
4852 : vector_stmt, node, 0,
4853 : vect_epilogue);
4854 11757 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4855 : vec_to_scalar, node, 0,
4856 : vect_epilogue);
4857 : }
4858 : else
4859 : /* Use extracts and reduction op for final reduction. For N
4860 : elements, we have N extracts and N-1 reduction ops. */
4861 3245 : epilogue_cost += record_stmt_cost (cost_vec,
4862 3245 : nelements + nelements - 1,
4863 : vector_stmt, node, 0,
4864 : vect_epilogue);
4865 : }
4866 : }
4867 :
4868 72216 : if (dump_enabled_p ())
4869 3009 : dump_printf (MSG_NOTE,
4870 : "vect_model_reduction_cost: inside_cost = %d, "
4871 : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4872 : prologue_cost, epilogue_cost);
4873 72216 : }
4874 :
4875 : /* SEQ is a sequence of instructions that initialize the reduction
4876 : described by REDUC_INFO. Emit them in the appropriate place. */
4877 :
4878 : static void
4879 462 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4880 : vect_reduc_info reduc_info, gimple *seq)
4881 : {
4882 462 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
4883 : {
4884 : /* When reusing an accumulator from the main loop, we only need
4885 : initialization instructions if the main loop can be skipped.
4886 : In that case, emit the initialization instructions at the end
4887 : of the guard block that does the skip. */
4888 22 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
4889 22 : gcc_assert (skip_edge);
4890 22 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4891 22 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4892 : }
4893 : else
4894 : {
4895 : /* The normal case: emit the initialization instructions on the
4896 : preheader edge. */
4897 440 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4898 440 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4899 : }
4900 462 : }
4901 :
4902 : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4903 : which performs a reduction involving GROUP_SIZE scalar statements.
4904 : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4905 : is nonnull, introducing extra elements of that value will not change the
4906 : result. */
4907 :
4908 : static void
4909 21799 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4910 : vect_reduc_info reduc_info,
4911 : tree vector_type,
4912 : vec<tree> *vec_oprnds,
4913 : unsigned int number_of_vectors,
4914 : unsigned int group_size, tree neutral_op)
4915 : {
4916 21799 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
4917 21799 : unsigned HOST_WIDE_INT nunits;
4918 21799 : unsigned j, number_of_places_left_in_vector;
4919 21799 : unsigned int i;
4920 :
4921 43598 : gcc_assert (group_size == initial_values.length () || neutral_op);
4922 :
4923 : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4924 : created vectors. It is greater than 1 if unrolling is performed.
4925 :
4926 : For example, we have two scalar operands, s1 and s2 (e.g., group of
4927 : strided accesses of size two), while NUNITS is four (i.e., four scalars
4928 : of this type can be packed in a vector). The output vector will contain
4929 : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4930 : will be 2).
4931 :
4932 : If GROUP_SIZE > NUNITS, the scalars will be split into several
4933 : vectors containing the operands.
4934 :
4935 : For example, NUNITS is four as before, and the group size is 8
4936 : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4937 : {s5, s6, s7, s8}. */
4938 :
4939 21799 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4940 : nunits = group_size;
4941 :
4942 21799 : tree vector_elt_type = TREE_TYPE (vector_type);
4943 21799 : number_of_places_left_in_vector = nunits;
4944 21799 : bool constant_p = true;
4945 21799 : tree_vector_builder elts (vector_type, nunits, 1);
4946 21799 : elts.quick_grow (nunits);
4947 21799 : gimple_seq ctor_seq = NULL;
4948 21799 : if (neutral_op
4949 43014 : && !useless_type_conversion_p (vector_elt_type,
4950 21215 : TREE_TYPE (neutral_op)))
4951 : {
4952 242 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4953 221 : neutral_op = gimple_build (&ctor_seq, COND_EXPR,
4954 : vector_elt_type,
4955 : neutral_op,
4956 : build_all_ones_cst (vector_elt_type),
4957 : build_zero_cst (vector_elt_type));
4958 : else
4959 21 : neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
4960 : }
4961 204229 : for (j = 0; j < nunits * number_of_vectors; ++j)
4962 : {
4963 182430 : tree op;
4964 182430 : i = j % group_size;
4965 :
4966 : /* Get the def before the loop. In reduction chain we have only
4967 : one initial value. Else we have as many as PHIs in the group. */
4968 182430 : if (i >= initial_values.length () || (j > i && neutral_op))
4969 : op = neutral_op;
4970 : else
4971 : {
4972 51392 : if (!useless_type_conversion_p (vector_elt_type,
4973 25696 : TREE_TYPE (initial_values[i])))
4974 : {
4975 257 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4976 466 : initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
4977 : vector_elt_type,
4978 233 : initial_values[i],
4979 : build_all_ones_cst
4980 : (vector_elt_type),
4981 : build_zero_cst
4982 : (vector_elt_type));
4983 : else
4984 48 : initial_values[i] = gimple_convert (&ctor_seq,
4985 : vector_elt_type,
4986 24 : initial_values[i]);
4987 : }
4988 25696 : op = initial_values[i];
4989 : }
4990 :
4991 : /* Create 'vect_ = {op0,op1,...,opn}'. */
4992 182430 : number_of_places_left_in_vector--;
4993 182430 : elts[nunits - number_of_places_left_in_vector - 1] = op;
4994 182430 : if (!CONSTANT_CLASS_P (op))
4995 2497 : constant_p = false;
4996 :
4997 182430 : if (number_of_places_left_in_vector == 0)
4998 : {
4999 23365 : tree init;
5000 46730 : if (constant_p && !neutral_op
5001 46449 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5002 23365 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5003 : /* Build the vector directly from ELTS. */
5004 23365 : init = gimple_build_vector (&ctor_seq, &elts);
5005 0 : else if (neutral_op)
5006 : {
5007 : /* Build a vector of the neutral value and shift the
5008 : other elements into place. */
5009 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5010 : neutral_op);
5011 0 : int k = nunits;
5012 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5013 : k -= 1;
5014 0 : while (k > 0)
5015 : {
5016 0 : k -= 1;
5017 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5018 0 : vector_type, init, elts[k]);
5019 : }
5020 : }
5021 : else
5022 : {
5023 : /* First time round, duplicate ELTS to fill the
5024 : required number of vectors. */
5025 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5026 : elts, number_of_vectors, *vec_oprnds);
5027 0 : break;
5028 : }
5029 23365 : vec_oprnds->quick_push (init);
5030 :
5031 23365 : number_of_places_left_in_vector = nunits;
5032 23365 : elts.new_vector (vector_type, nunits, 1);
5033 23365 : elts.quick_grow (nunits);
5034 23365 : constant_p = true;
5035 : }
5036 : }
5037 21799 : if (ctor_seq != NULL)
5038 462 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5039 21799 : }
5040 :
5041 : vect_reduc_info
5042 161998 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
5043 : {
5044 161998 : if (node->cycle_info.id == -1)
5045 : return NULL;
5046 160030 : return loop_vinfo->reduc_infos[node->cycle_info.id];
5047 : }
5048 :
5049 : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5050 : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5051 : return false. */
5052 :
5053 : static bool
5054 21440 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5055 : vect_reduc_info reduc_info, tree vectype)
5056 : {
5057 21440 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5058 21440 : if (!main_loop_vinfo)
5059 : return false;
5060 :
5061 4672 : if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5062 : return false;
5063 :
5064 : /* We are not set up to handle vector bools when they are not mapped
5065 : to vector integer data types. */
5066 4657 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5067 4729 : && GET_MODE_CLASS (TYPE_MODE (vectype)) != MODE_VECTOR_INT)
5068 : return false;
5069 :
5070 4655 : unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
5071 4655 : auto_vec<tree, 16> main_loop_results (num_phis);
5072 4655 : auto_vec<tree, 16> initial_values (num_phis);
5073 4655 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5074 : {
5075 : /* The epilogue loop can be entered either from the main loop or
5076 : from an earlier guard block. */
5077 4432 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5078 17752 : for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
5079 : {
5080 : /* Look for:
5081 :
5082 : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5083 : INITIAL_VALUE(guard block)>. */
5084 4456 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5085 :
5086 4456 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5087 4456 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5088 :
5089 4456 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5090 4456 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5091 :
5092 4456 : main_loop_results.quick_push (from_main_loop);
5093 4456 : initial_values.quick_push (from_skip);
5094 : }
5095 : }
5096 : else
5097 : /* The main loop dominates the epilogue loop. */
5098 223 : main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));
5099 :
5100 : /* See if the main loop has the kind of accumulator we need. */
5101 4655 : vect_reusable_accumulator *accumulator
5102 4655 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5103 4655 : if (!accumulator
5104 9294 : || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
5105 13945 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5106 : VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
5107 : return false;
5108 :
5109 : /* Handle the case where we can reduce wider vectors to narrower ones. */
5110 4645 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5111 4645 : unsigned HOST_WIDE_INT m;
5112 4645 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5113 4645 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5114 0 : return false;
5115 : /* Check the intermediate vector types and operations are available. */
5116 4645 : tree prev_vectype = old_vectype;
5117 4645 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5118 13561 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5119 : {
5120 4795 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5121 4795 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5122 4795 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5123 4795 : if (!intermediate_vectype
5124 4795 : || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
5125 : intermediate_vectype)
5126 9070 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5127 4275 : TYPE_MODE (intermediate_vectype)))
5128 : return false;
5129 : prev_vectype = intermediate_vectype;
5130 : }
5131 :
5132 : /* Non-SLP reductions might apply an adjustment after the reduction
5133 : operation, in order to simplify the initialization of the accumulator.
5134 : If the epilogue loop carries on from where the main loop left off,
5135 : it should apply the same adjustment to the final reduction result.
5136 :
5137 : If the epilogue loop can also be entered directly (rather than via
5138 : the main loop), we need to be able to handle that case in the same way,
5139 : with the same adjustment. (In principle we could add a PHI node
5140 : to select the correct adjustment, but in practice that shouldn't be
5141 : necessary.) */
5142 4121 : tree main_adjustment
5143 4121 : = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5144 4121 : if (loop_vinfo->main_loop_edge && main_adjustment)
5145 : {
5146 3435 : gcc_assert (num_phis == 1);
5147 3435 : tree initial_value = initial_values[0];
5148 : /* Check that we can use INITIAL_VALUE as the adjustment and
5149 : initialize the accumulator with a neutral value instead. */
5150 3435 : if (!operand_equal_p (initial_value, main_adjustment))
5151 : return false;
5152 3425 : initial_values[0] = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
5153 : }
5154 4111 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5155 4111 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
5156 4111 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
5157 4111 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
5158 4111 : return true;
5159 4655 : }
5160 :
5161 : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5162 : CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5163 :
5164 : static tree
5165 4155 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5166 : gimple_seq *seq)
5167 : {
5168 4155 : gcc_assert (!VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (vec_def))
5169 : || (GET_MODE_CLASS (TYPE_MODE (TREE_TYPE (vec_def)))
5170 : == MODE_VECTOR_INT));
5171 4155 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5172 4155 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5173 4155 : tree stype = TREE_TYPE (vectype);
5174 4155 : tree new_temp = vec_def;
5175 8453 : while (nunits > nunits1)
5176 : {
5177 4298 : nunits /= 2;
5178 4298 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5179 4298 : stype, nunits);
5180 4298 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5181 :
5182 : /* The target has to make sure we support lowpart/highpart
5183 : extraction, either via direct vector extract or through
5184 : an integer mode punning. */
5185 4298 : tree dst1, dst2;
5186 4298 : gimple *epilog_stmt;
5187 4298 : if (convert_optab_handler (vec_extract_optab,
5188 4298 : TYPE_MODE (TREE_TYPE (new_temp)),
5189 4298 : TYPE_MODE (vectype1))
5190 : != CODE_FOR_nothing)
5191 : {
5192 : /* Extract sub-vectors directly once vec_extract becomes
5193 : a conversion optab. */
5194 2618 : dst1 = make_ssa_name (vectype1);
5195 2618 : epilog_stmt
5196 5236 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5197 : build3 (BIT_FIELD_REF, vectype1,
5198 2618 : new_temp, TYPE_SIZE (vectype1),
5199 : bitsize_int (0)));
5200 2618 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5201 2618 : dst2 = make_ssa_name (vectype1);
5202 2618 : epilog_stmt
5203 2618 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5204 : build3 (BIT_FIELD_REF, vectype1,
5205 2618 : new_temp, TYPE_SIZE (vectype1),
5206 2618 : bitsize_int (bitsize)));
5207 2618 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5208 : }
5209 : else
5210 : {
5211 : /* Extract via punning to appropriately sized integer mode
5212 : vector. */
5213 1680 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5214 1680 : tree etype = build_vector_type (eltype, 2);
5215 3360 : gcc_assert (convert_optab_handler (vec_extract_optab,
5216 : TYPE_MODE (etype),
5217 : TYPE_MODE (eltype))
5218 : != CODE_FOR_nothing);
5219 1680 : tree tem = make_ssa_name (etype);
5220 1680 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5221 : build1 (VIEW_CONVERT_EXPR,
5222 : etype, new_temp));
5223 1680 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5224 1680 : new_temp = tem;
5225 1680 : tem = make_ssa_name (eltype);
5226 1680 : epilog_stmt
5227 3360 : = gimple_build_assign (tem, BIT_FIELD_REF,
5228 : build3 (BIT_FIELD_REF, eltype,
5229 1680 : new_temp, TYPE_SIZE (eltype),
5230 : bitsize_int (0)));
5231 1680 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5232 1680 : dst1 = make_ssa_name (vectype1);
5233 1680 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5234 : build1 (VIEW_CONVERT_EXPR,
5235 : vectype1, tem));
5236 1680 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5237 1680 : tem = make_ssa_name (eltype);
5238 1680 : epilog_stmt
5239 1680 : = gimple_build_assign (tem, BIT_FIELD_REF,
5240 : build3 (BIT_FIELD_REF, eltype,
5241 1680 : new_temp, TYPE_SIZE (eltype),
5242 1680 : bitsize_int (bitsize)));
5243 1680 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5244 1680 : dst2 = make_ssa_name (vectype1);
5245 1680 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5246 : build1 (VIEW_CONVERT_EXPR,
5247 : vectype1, tem));
5248 1680 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5249 : }
5250 :
5251 4298 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5252 : }
5253 4155 : if (!useless_type_conversion_p (vectype, TREE_TYPE (new_temp)))
5254 : {
5255 66 : tree dst3 = make_ssa_name (vectype);
5256 66 : gimple *epilog_stmt = gimple_build_assign (dst3, VIEW_CONVERT_EXPR,
5257 : build1 (VIEW_CONVERT_EXPR,
5258 : vectype, new_temp));
5259 66 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5260 66 : new_temp = dst3;
5261 : }
5262 :
5263 4155 : return new_temp;
5264 : }
5265 :
5266 : /* Function vect_create_epilog_for_reduction
5267 :
5268 : Create code at the loop-epilog to finalize the result of a reduction
5269 : computation.
5270 :
5271 : STMT_INFO is the scalar reduction stmt that is being vectorized.
5272 : SLP_NODE is an SLP node containing a group of reduction statements. The
5273 : first one in this group is STMT_INFO.
5274 : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5275 : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5276 : (counting from 0)
5277 : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5278 : exit this edge is always the main loop exit.
5279 :
5280 : This function:
5281 : 1. Completes the reduction def-use cycles.
5282 : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5283 : by calling the function specified by REDUC_FN if available, or by
5284 : other means (whole-vector shifts or a scalar loop).
5285 : The function also creates a new phi node at the loop exit to preserve
5286 : loop-closed form, as illustrated below.
5287 :
5288 : The flow at the entry to this function:
5289 :
5290 : loop:
5291 : vec_def = phi <vec_init, null> # REDUCTION_PHI
5292 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5293 : s_loop = scalar_stmt # (scalar) STMT_INFO
5294 : loop_exit:
5295 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5296 : use <s_out0>
5297 : use <s_out0>
5298 :
5299 : The above is transformed by this function into:
5300 :
5301 : loop:
5302 : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5303 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5304 : s_loop = scalar_stmt # (scalar) STMT_INFO
5305 : loop_exit:
5306 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5307 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5308 : v_out2 = reduce <v_out1>
5309 : s_out3 = extract_field <v_out2, 0>
5310 : s_out4 = adjust_result <s_out3>
5311 : use <s_out4>
5312 : use <s_out4>
5313 : */
5314 :
5315 : static void
5316 22146 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5317 : stmt_vec_info stmt_info,
5318 : slp_tree slp_node,
5319 : slp_instance slp_node_instance,
5320 : edge loop_exit)
5321 : {
5322 22146 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
5323 22146 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5324 22146 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
5325 22146 : tree vectype;
5326 22146 : machine_mode mode;
5327 22146 : basic_block exit_bb;
5328 22146 : gimple *new_phi = NULL, *phi = NULL;
5329 22146 : gimple_stmt_iterator exit_gsi;
5330 22146 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5331 22146 : gimple *epilog_stmt = NULL;
5332 22146 : gimple *exit_phi;
5333 22146 : tree def;
5334 22146 : tree orig_name, scalar_result;
5335 22146 : imm_use_iterator imm_iter;
5336 22146 : use_operand_p use_p;
5337 22146 : gimple *use_stmt;
5338 22146 : auto_vec<tree> reduc_inputs;
5339 22146 : int j, i;
5340 22146 : vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
5341 22146 : unsigned int k;
5342 : /* SLP reduction without reduction chain, e.g.,
5343 : # a1 = phi <a2, a0>
5344 : # b1 = phi <b2, b0>
5345 : a2 = operation (a1)
5346 : b2 = operation (b1) */
5347 22146 : const bool slp_reduc = !reduc_info->is_reduc_chain;
5348 22146 : tree induction_index = NULL_TREE;
5349 :
5350 22146 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5351 :
5352 22146 : bool double_reduc = false;
5353 22146 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5354 22146 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5355 : {
5356 0 : double_reduc = true;
5357 0 : gcc_assert (slp_reduc);
5358 : }
5359 :
5360 22146 : vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
5361 22146 : gcc_assert (vectype);
5362 22146 : mode = TYPE_MODE (vectype);
5363 :
5364 22146 : tree induc_val = NULL_TREE;
5365 22146 : tree adjustment_def = NULL;
5366 : /* Optimize: for induction condition reduction, if we can't use zero
5367 : for induc_val, use initial_def. */
5368 22146 : if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5369 62 : induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
5370 22084 : else if (double_reduc)
5371 : ;
5372 : else
5373 22084 : adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
5374 :
5375 22146 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5376 22146 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5377 22146 : if (slp_reduc)
5378 : /* All statements produce live-out values. */
5379 43860 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5380 :
5381 22146 : unsigned vec_num
5382 22146 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5383 :
5384 : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5385 : which is updated with the current index of the loop for every match of
5386 : the original loop's cond_expr (VEC_STMT). This results in a vector
5387 : containing the last time the condition passed for that vector lane.
5388 : The first match will be a 1 to allow 0 to be used for non-matching
5389 : indexes. If there are no matches at all then the vector will be all
5390 : zeroes.
5391 :
5392 : PR92772: This algorithm is broken for architectures that support
5393 : masked vectors, but do not provide fold_extract_last. */
5394 22146 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
5395 : {
5396 87 : gcc_assert (!double_reduc);
5397 87 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5398 87 : slp_tree cond_node = slp_node_instance->root;
5399 183 : while (cond_node != slp_node_instance->reduc_phis)
5400 : {
5401 96 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5402 96 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5403 : {
5404 96 : gimple *vec_stmt
5405 96 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5406 96 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5407 96 : ccompares.safe_push
5408 96 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5409 96 : SLP_TREE_REDUC_IDX (cond_node) == 2));
5410 : }
5411 96 : int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
5412 96 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5413 : }
5414 87 : gcc_assert (ccompares.length () != 0);
5415 :
5416 87 : tree indx_before_incr, indx_after_incr;
5417 87 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5418 87 : int scalar_precision
5419 87 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5420 87 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5421 87 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
5422 87 : (TYPE_MODE (vectype), cr_index_scalar_type,
5423 : TYPE_VECTOR_SUBPARTS (vectype));
5424 :
5425 : /* First we create a simple vector induction variable which starts
5426 : with the values {1,2,3,...} (SERIES_VECT) and increments by the
5427 : vector size (STEP). */
5428 :
5429 : /* Create a {1,2,3,...} vector. */
5430 87 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5431 :
5432 : /* Create a vector of the step value. */
5433 87 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5434 87 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5435 :
5436 : /* Create an induction variable. */
5437 87 : gimple_stmt_iterator incr_gsi;
5438 87 : bool insert_after;
5439 87 : vect_iv_increment_position (LOOP_VINFO_MAIN_EXIT (loop_vinfo),
5440 : &incr_gsi, &insert_after);
5441 87 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5442 : insert_after, &indx_before_incr, &indx_after_incr);
5443 :
5444 : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5445 : filled with zeros (VEC_ZERO). */
5446 :
5447 : /* Create a vector of 0s. */
5448 87 : tree zero = build_zero_cst (cr_index_scalar_type);
5449 87 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5450 :
5451 : /* Create a vector phi node. */
5452 87 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5453 87 : new_phi = create_phi_node (new_phi_tree, loop->header);
5454 87 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5455 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
5456 :
5457 : /* Now take the condition from the loops original cond_exprs
5458 : and produce a new cond_exprs (INDEX_COND_EXPR) which for
5459 : every match uses values from the induction variable
5460 : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5461 : (NEW_PHI_TREE).
5462 : Finally, we update the phi (NEW_PHI_TREE) to take the value of
5463 : the new cond_expr (INDEX_COND_EXPR). */
5464 87 : gimple_seq stmts = NULL;
5465 270 : for (int i = ccompares.length () - 1; i != -1; --i)
5466 : {
5467 96 : tree ccompare = ccompares[i].first;
5468 96 : if (ccompares[i].second)
5469 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5470 : cr_index_vector_type,
5471 : ccompare,
5472 : indx_before_incr, new_phi_tree);
5473 : else
5474 27 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5475 : cr_index_vector_type,
5476 : ccompare,
5477 : new_phi_tree, indx_before_incr);
5478 : }
5479 87 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5480 :
5481 : /* Update the phi with the vec cond. */
5482 87 : induction_index = new_phi_tree;
5483 87 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5484 : loop_latch_edge (loop), UNKNOWN_LOCATION);
5485 87 : }
5486 :
5487 : /* 2. Create epilog code.
5488 : The reduction epilog code operates across the elements of the vector
5489 : of partial results computed by the vectorized loop.
5490 : The reduction epilog code consists of:
5491 :
5492 : step 1: compute the scalar result in a vector (v_out2)
5493 : step 2: extract the scalar result (s_out3) from the vector (v_out2)
5494 : step 3: adjust the scalar result (s_out3) if needed.
5495 :
5496 : Step 1 can be accomplished using one of the following three schemes:
5497 : (scheme 1) using reduc_fn, if available.
5498 : (scheme 2) using whole-vector shifts, if available.
5499 : (scheme 3) using a scalar loop. In this case steps 1+2 above are
5500 : combined.
5501 :
5502 : The overall epilog code looks like this:
5503 :
5504 : s_out0 = phi <s_loop> # original EXIT_PHI
5505 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5506 : v_out2 = reduce <v_out1> # step 1
5507 : s_out3 = extract_field <v_out2, 0> # step 2
5508 : s_out4 = adjust_result <s_out3> # step 3
5509 :
5510 : (step 3 is optional, and steps 1 and 2 may be combined).
5511 : Lastly, the uses of s_out0 are replaced by s_out4. */
5512 :
5513 :
5514 : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5515 : v_out1 = phi <VECT_DEF>
5516 : Store them in NEW_PHIS. */
5517 : /* We need to reduce values in all exits. */
5518 22146 : exit_bb = loop_exit->dest;
5519 22146 : exit_gsi = gsi_after_labels (exit_bb);
5520 22146 : reduc_inputs.create (vec_num);
5521 45868 : for (unsigned i = 0; i < vec_num; i++)
5522 : {
5523 23722 : gimple_seq stmts = NULL;
5524 23722 : def = vect_get_slp_vect_def (slp_node, i);
5525 23722 : tree new_def = copy_ssa_name (def);
5526 23722 : phi = create_phi_node (new_def, exit_bb);
5527 23722 : if (LOOP_VINFO_MAIN_EXIT (loop_vinfo) == loop_exit)
5528 23695 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
5529 : else
5530 : {
5531 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
5532 30 : SET_PHI_ARG_DEF (phi, k, def);
5533 : }
5534 23722 : new_def = gimple_convert (&stmts, vectype, new_def);
5535 23722 : reduc_inputs.quick_push (new_def);
5536 23722 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5537 : }
5538 :
5539 : /* 2.2 Get the original scalar reduction variable as defined in the loop.
5540 : In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
5541 : pattern), the scalar-def is taken from the original stmt that the
5542 : pattern-stmt (STMT) replaces. */
5543 :
5544 22973 : tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5545 22146 : tree scalar_type = TREE_TYPE (scalar_dest);
5546 22146 : scalar_results.truncate (0);
5547 22146 : scalar_results.reserve_exact (group_size);
5548 22146 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5549 :
5550 : /* True if we should implement SLP_REDUC using native reduction operations
5551 : instead of scalar operations. */
5552 22146 : const bool direct_slp_reduc
5553 22146 : = (reduc_fn != IFN_LAST
5554 22146 : && slp_reduc
5555 22146 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5556 :
5557 : /* If signed overflow is undefined we might need to perform reduction
5558 : computations in an unsigned type. */
5559 22146 : tree compute_vectype = vectype;
5560 22146 : if (ANY_INTEGRAL_TYPE_P (vectype)
5561 15061 : && TYPE_OVERFLOW_UNDEFINED (vectype)
5562 5614 : && code.is_tree_code ()
5563 27760 : && arith_code_with_undefined_signed_overflow ((tree_code) code))
5564 4108 : compute_vectype = unsigned_type_for (vectype);
5565 :
5566 : /* In case of reduction chain, e.g.,
5567 : # a1 = phi <a3, a0>
5568 : a2 = operation (a1)
5569 : a3 = operation (a2),
5570 :
5571 : we may end up with more than one vector result. Here we reduce them
5572 : to one vector.
5573 :
5574 : The same is true for a SLP reduction, e.g.,
5575 : # a1 = phi <a2, a0>
5576 : # b1 = phi <b2, b0>
5577 : a2 = operation (a1)
5578 : b2 = operation (a2),
5579 :
5580 : where we can end up with more than one vector as well. We can
5581 : easily accumulate vectors when the number of vector elements is
5582 : a multiple of the SLP group size.
5583 :
5584 : The same is true if we couldn't use a single defuse cycle. */
5585 22146 : if ((!slp_reduc
5586 : || direct_slp_reduc
5587 : || (slp_reduc
5588 22146 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
5589 44292 : && reduc_inputs.length () > 1)
5590 : {
5591 542 : gimple_seq stmts = NULL;
5592 542 : tree single_input = reduc_inputs[0];
5593 542 : if (compute_vectype != vectype)
5594 156 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5595 : compute_vectype, single_input);
5596 1965 : for (k = 1; k < reduc_inputs.length (); k++)
5597 : {
5598 1423 : tree input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5599 1423 : compute_vectype, reduc_inputs[k]);
5600 1423 : single_input = gimple_build (&stmts, code, compute_vectype,
5601 : single_input, input);
5602 : }
5603 542 : if (compute_vectype != vectype)
5604 156 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5605 : vectype, single_input);
5606 542 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5607 :
5608 542 : reduc_inputs.truncate (0);
5609 542 : reduc_inputs.safe_push (single_input);
5610 : }
5611 :
5612 22146 : tree orig_reduc_input = reduc_inputs[0];
5613 :
5614 : /* If this loop is an epilogue loop that can be skipped after the
5615 : main loop, we can only share a reduction operation between the
5616 : main loop and the epilogue if we put it at the target of the
5617 : skip edge.
5618 :
5619 : We can still reuse accumulators if this check fails. Doing so has
5620 : the minor(?) benefit of making the epilogue loop's scalar result
5621 : independent of the main loop's scalar result. */
5622 22146 : bool unify_with_main_loop_p = false;
5623 22146 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
5624 4111 : && loop_vinfo->skip_this_loop_edge
5625 3871 : && single_succ_p (exit_bb)
5626 22167 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5627 : {
5628 21 : unify_with_main_loop_p = true;
5629 :
5630 21 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5631 21 : reduc_inputs[0] = make_ssa_name (vectype);
5632 21 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5633 21 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5634 : UNKNOWN_LOCATION);
5635 21 : add_phi_arg (new_phi,
5636 21 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
5637 : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5638 21 : exit_gsi = gsi_after_labels (reduc_block);
5639 : }
5640 :
5641 : /* Shouldn't be used beyond this point. */
5642 22146 : exit_bb = nullptr;
5643 :
5644 : /* If we are operating on a mask vector and do not support direct mask
5645 : reduction, work on a bool data vector instead of a mask vector. */
5646 22146 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5647 249 : && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
5648 22345 : && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
5649 : {
5650 199 : compute_vectype = vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
5651 199 : gimple_seq stmts = NULL;
5652 406 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5653 414 : reduc_inputs[i] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
5654 207 : reduc_inputs[i],
5655 : build_one_cst (vectype),
5656 : build_zero_cst (vectype));
5657 199 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5658 : }
5659 :
5660 22146 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5661 87 : && reduc_fn != IFN_LAST)
5662 : {
5663 : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5664 : various data values where the condition matched and another vector
5665 : (INDUCTION_INDEX) containing all the indexes of those matches. We
5666 : need to extract the last matching index (which will be the index with
5667 : highest value) and use this to index into the data vector.
5668 : For the case where there were no matches, the data vector will contain
5669 : all default values and the index vector will be all zeros. */
5670 :
5671 : /* Get various versions of the type of the vector of indexes. */
5672 14 : tree index_vec_type = TREE_TYPE (induction_index);
5673 14 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5674 14 : tree index_scalar_type = TREE_TYPE (index_vec_type);
5675 14 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
5676 :
5677 : /* Get an unsigned integer version of the type of the data vector. */
5678 14 : int scalar_precision
5679 14 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5680 14 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5681 14 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5682 : vectype);
5683 :
5684 : /* First we need to create a vector (ZERO_VEC) of zeros and another
5685 : vector (MAX_INDEX_VEC) filled with the last matching index, which we
5686 : can create using a MAX reduction and then expanding.
5687 : In the case where the loop never made any matches, the max index will
5688 : be zero. */
5689 :
5690 : /* Vector of {0, 0, 0,...}. */
5691 14 : tree zero_vec = build_zero_cst (vectype);
5692 :
5693 : /* Find maximum value from the vector of found indexes. */
5694 14 : tree max_index = make_ssa_name (index_scalar_type);
5695 14 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5696 : 1, induction_index);
5697 14 : gimple_call_set_lhs (max_index_stmt, max_index);
5698 14 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5699 :
5700 : /* Vector of {max_index, max_index, max_index,...}. */
5701 14 : tree max_index_vec = make_ssa_name (index_vec_type);
5702 14 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5703 : max_index);
5704 14 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5705 : max_index_vec_rhs);
5706 14 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5707 :
5708 : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5709 : with the vector (INDUCTION_INDEX) of found indexes, choosing values
5710 : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5711 : otherwise. Only one value should match, resulting in a vector
5712 : (VEC_COND) with one data value and the rest zeros.
5713 : In the case where the loop never made any matches, every index will
5714 : match, resulting in a vector with all data values (which will all be
5715 : the default value). */
5716 :
5717 : /* Compare the max index vector to the vector of found indexes to find
5718 : the position of the max value. */
5719 14 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
5720 14 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5721 : induction_index,
5722 : max_index_vec);
5723 14 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5724 :
5725 : /* Use the compare to choose either values from the data vector or
5726 : zero. */
5727 14 : tree vec_cond = make_ssa_name (vectype);
5728 14 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5729 : vec_compare,
5730 14 : reduc_inputs[0],
5731 : zero_vec);
5732 14 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5733 :
5734 : /* Finally we need to extract the data value from the vector (VEC_COND)
5735 : into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5736 : reduction, but because this doesn't exist, we can use a MAX reduction
5737 : instead. The data value might be signed or a float so we need to cast
5738 : it first.
5739 : In the case where the loop never made any matches, the data values are
5740 : all identical, and so will reduce down correctly. */
5741 :
5742 : /* Make the matched data values unsigned. */
5743 14 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5744 14 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5745 : vec_cond);
5746 14 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5747 : VIEW_CONVERT_EXPR,
5748 : vec_cond_cast_rhs);
5749 14 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5750 :
5751 : /* Reduce down to a scalar value. */
5752 14 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
5753 14 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5754 : 1, vec_cond_cast);
5755 14 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5756 14 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5757 :
5758 : /* Convert the reduced value back to the result type and set as the
5759 : result. */
5760 14 : gimple_seq stmts = NULL;
5761 14 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5762 : data_reduc);
5763 14 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5764 14 : scalar_results.safe_push (new_temp);
5765 14 : }
5766 22132 : else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5767 73 : && reduc_fn == IFN_LAST)
5768 : {
5769 : /* Condition reduction without supported IFN_REDUC_MAX. Generate
5770 : idx = 0;
5771 : idx_val = induction_index[0];
5772 : val = data_reduc[0];
5773 : for (idx = 0, val = init, i = 0; i < nelts; ++i)
5774 : if (induction_index[i] > idx_val)
5775 : val = data_reduc[i], idx_val = induction_index[i];
5776 : return val; */
5777 :
5778 73 : tree data_eltype = TREE_TYPE (vectype);
5779 73 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5780 73 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5781 73 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5782 : /* Enforced by vectorizable_reduction, which ensures we have target
5783 : support before allowing a conditional reduction on variable-length
5784 : vectors. */
5785 73 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5786 73 : tree idx_val = NULL_TREE, val = NULL_TREE;
5787 469 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5788 : {
5789 396 : tree old_idx_val = idx_val;
5790 396 : tree old_val = val;
5791 396 : idx_val = make_ssa_name (idx_eltype);
5792 396 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5793 : build3 (BIT_FIELD_REF, idx_eltype,
5794 : induction_index,
5795 396 : bitsize_int (el_size),
5796 396 : bitsize_int (off)));
5797 396 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5798 396 : val = make_ssa_name (data_eltype);
5799 792 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5800 : build3 (BIT_FIELD_REF,
5801 : data_eltype,
5802 396 : reduc_inputs[0],
5803 396 : bitsize_int (el_size),
5804 396 : bitsize_int (off)));
5805 396 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5806 396 : if (off != 0)
5807 : {
5808 323 : tree new_idx_val = idx_val;
5809 323 : if (off != v_size - el_size)
5810 : {
5811 250 : new_idx_val = make_ssa_name (idx_eltype);
5812 250 : epilog_stmt = gimple_build_assign (new_idx_val,
5813 : MAX_EXPR, idx_val,
5814 : old_idx_val);
5815 250 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5816 : }
5817 323 : tree cond = make_ssa_name (boolean_type_node);
5818 323 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5819 : idx_val, old_idx_val);
5820 323 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5821 323 : tree new_val = make_ssa_name (data_eltype);
5822 323 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5823 : cond, val, old_val);
5824 323 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5825 323 : idx_val = new_idx_val;
5826 323 : val = new_val;
5827 : }
5828 : }
5829 : /* Convert the reduced value back to the result type and set as the
5830 : result. */
5831 73 : gimple_seq stmts = NULL;
5832 73 : val = gimple_convert (&stmts, scalar_type, val);
5833 73 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5834 73 : scalar_results.safe_push (val);
5835 73 : }
5836 :
5837 : /* 2.3 Create the reduction code, using one of the three schemes described
5838 : above. In SLP we simply need to extract all the elements from the
5839 : vector (without reducing them), so we use scalar shifts. */
5840 22059 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
5841 : {
5842 20090 : tree tmp;
5843 20090 : tree vec_elem_type;
5844 :
5845 : /* Case 1: Create:
5846 : v_out2 = reduc_expr <v_out1> */
5847 :
5848 20090 : if (dump_enabled_p ())
5849 1517 : dump_printf_loc (MSG_NOTE, vect_location,
5850 : "Reduce using direct vector reduction.\n");
5851 :
5852 20090 : gimple_seq stmts = NULL;
5853 20090 : vec_elem_type = TREE_TYPE (vectype);
5854 20090 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5855 20090 : vec_elem_type, reduc_inputs[0]);
5856 20090 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5857 20090 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5858 :
5859 20090 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5860 62 : && induc_val)
5861 : {
5862 : /* Earlier we set the initial value to be a vector if induc_val
5863 : values. Check the result and if it is induc_val then replace
5864 : with the original initial value, unless induc_val is
5865 : the same as initial_def already. */
5866 60 : tree zcompare = make_ssa_name (boolean_type_node);
5867 60 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5868 : new_temp, induc_val);
5869 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5870 60 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
5871 60 : tmp = make_ssa_name (new_scalar_dest);
5872 60 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5873 : initial_def, new_temp);
5874 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5875 60 : new_temp = tmp;
5876 : }
5877 :
5878 20090 : scalar_results.safe_push (new_temp);
5879 20090 : }
5880 1782 : else if (direct_slp_reduc)
5881 : {
5882 : /* Here we create one vector for each of the GROUP_SIZE results,
5883 : with the elements for other SLP statements replaced with the
5884 : neutral value. We can then do a normal reduction on each vector. */
5885 :
5886 : /* Enforced by vectorizable_reduction. */
5887 : gcc_assert (reduc_inputs.length () == 1);
5888 : gcc_assert (pow2p_hwi (group_size));
5889 :
5890 : gimple_seq seq = NULL;
5891 :
5892 : /* Build a vector {0, 1, 2, ...}, with the same number of elements
5893 : and the same element size as VECTYPE. */
5894 : tree index = build_index_vector (vectype, 0, 1);
5895 : tree index_type = TREE_TYPE (index);
5896 : tree index_elt_type = TREE_TYPE (index_type);
5897 : tree mask_type = truth_type_for (index_type);
5898 :
5899 : /* Create a vector that, for each element, identifies which of
5900 : the results should use it. */
5901 : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5902 : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5903 : build_vector_from_val (index_type, index_mask));
5904 :
5905 : /* Get a neutral vector value. This is simply a splat of the neutral
5906 : scalar value if we have one, otherwise the initial scalar value
5907 : is itself a neutral value. */
5908 : tree vector_identity = NULL_TREE;
5909 : tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5910 : NULL_TREE, false);
5911 : if (neutral_op)
5912 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5913 : neutral_op);
5914 : for (unsigned int i = 0; i < group_size; ++i)
5915 : {
5916 : /* If there's no universal neutral value, we can use the
5917 : initial scalar value from the original PHI. This is used
5918 : for MIN and MAX reduction, for example. */
5919 : if (!neutral_op)
5920 : {
5921 : tree scalar_value
5922 : = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
5923 : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5924 : scalar_value);
5925 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5926 : scalar_value);
5927 : }
5928 :
5929 : /* Calculate the equivalent of:
5930 :
5931 : sel[j] = (index[j] == i);
5932 :
5933 : which selects the elements of REDUC_INPUTS[0] that should
5934 : be included in the result. */
5935 : tree compare_val = build_int_cst (index_elt_type, i);
5936 : compare_val = build_vector_from_val (index_type, compare_val);
5937 : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5938 : index, compare_val);
5939 :
5940 : /* Calculate the equivalent of:
5941 :
5942 : vec = seq ? reduc_inputs[0] : vector_identity;
5943 :
5944 : VEC is now suitable for a full vector reduction. */
5945 : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5946 : sel, reduc_inputs[0], vector_identity);
5947 :
5948 : /* Do the reduction and convert it to the appropriate type. */
5949 : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5950 : TREE_TYPE (vectype), vec);
5951 : scalar = gimple_convert (&seq, scalar_type, scalar);
5952 : scalar_results.safe_push (scalar);
5953 : }
5954 : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5955 : }
5956 : else
5957 : {
5958 1782 : bool reduce_with_shift;
5959 1782 : tree vec_temp;
5960 :
5961 1782 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5962 :
5963 : /* See if the target wants to do the final (shift) reduction
5964 : in a vector mode of smaller size and first reduce upper/lower
5965 : halves against each other. */
5966 1969 : enum machine_mode mode1 = mode;
5967 1969 : tree stype = TREE_TYPE (vectype);
5968 1969 : if (compute_vectype != vectype)
5969 : {
5970 546 : stype = unsigned_type_for (stype);
5971 546 : gimple_seq stmts = NULL;
5972 1150 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5973 : {
5974 604 : tree new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5975 604 : compute_vectype, reduc_inputs[i]);
5976 604 : reduc_inputs[i] = new_temp;
5977 : }
5978 546 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5979 : }
5980 1969 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5981 1969 : unsigned nunits1 = nunits;
5982 1969 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5983 1969 : && reduc_inputs.length () == 1)
5984 : {
5985 41 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5986 : /* For SLP reductions we have to make sure lanes match up, but
5987 : since we're doing individual element final reduction reducing
5988 : vector width here is even more important.
5989 : ??? We can also separate lanes with permutes, for the common
5990 : case of power-of-two group-size odd/even extracts would work. */
5991 41 : if (slp_reduc && nunits != nunits1)
5992 : {
5993 41 : nunits1 = least_common_multiple (nunits1, group_size);
5994 82 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5995 : }
5996 : }
5997 1928 : else if (!slp_reduc
5998 1928 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5999 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6000 :
6001 1969 : tree vectype1 = compute_vectype;
6002 1969 : if (mode1 != mode)
6003 : {
6004 47 : vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6005 47 : stype, nunits1);
6006 : /* First reduce the vector to the desired vector size we should
6007 : do shift reduction on by combining upper and lower halves. */
6008 47 : gimple_seq stmts = NULL;
6009 47 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6010 : code, &stmts);
6011 47 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6012 47 : reduc_inputs[0] = new_temp;
6013 : }
6014 :
6015 1969 : reduce_with_shift = have_whole_vector_shift (mode1);
6016 728 : if (!VECTOR_MODE_P (mode1)
6017 2695 : || !directly_supported_p (code, vectype1))
6018 : reduce_with_shift = false;
6019 :
6020 1952 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
6021 : {
6022 1729 : int element_bitsize = vector_element_bits (vectype1);
6023 : /* Enforced by vectorizable_reduction, which disallows SLP reductions
6024 : for variable-length vectors and also requires direct target support
6025 : for loop reductions. */
6026 1729 : int nelements = TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6027 1729 : vec_perm_builder sel;
6028 1729 : vec_perm_indices indices;
6029 :
6030 1729 : int elt_offset;
6031 :
6032 1729 : tree zero_vec = build_zero_cst (vectype1);
6033 : /* Case 2: Create:
6034 : for (offset = nelements/2; offset >= 1; offset/=2)
6035 : {
6036 : Create: va' = vec_shift <va, offset>
6037 : Create: va = vop <va, va'>
6038 : } */
6039 :
6040 1729 : if (dump_enabled_p ())
6041 368 : dump_printf_loc (MSG_NOTE, vect_location,
6042 : "Reduce using vector shifts\n");
6043 :
6044 1729 : gimple_seq stmts = NULL;
6045 1729 : new_temp = gimple_convert (&stmts, vectype1, reduc_inputs[0]);
6046 1729 : for (elt_offset = nelements / 2;
6047 3770 : elt_offset >= 1;
6048 2041 : elt_offset /= 2)
6049 : {
6050 2041 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6051 2041 : indices.new_vector (sel, 2, nelements);
6052 2041 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6053 2041 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6054 : new_temp, zero_vec, mask);
6055 2041 : new_temp = gimple_build (&stmts, code,
6056 : vectype1, new_name, new_temp);
6057 : }
6058 :
6059 : /* 2.4 Extract the final scalar result. Create:
6060 : s_out3 = extract_field <v_out2, bitpos> */
6061 :
6062 1729 : if (dump_enabled_p ())
6063 368 : dump_printf_loc (MSG_NOTE, vect_location,
6064 : "extract scalar result\n");
6065 :
6066 1729 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
6067 1729 : new_temp, bitsize_int (element_bitsize),
6068 1729 : bitsize_zero_node);
6069 1729 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6070 1729 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6071 1729 : scalar_results.safe_push (new_temp);
6072 1729 : }
6073 : else
6074 : {
6075 : /* Case 3: Create:
6076 : s = extract_field <v_out2, 0>
6077 : for (offset = element_size;
6078 : offset < vector_size;
6079 : offset += element_size;)
6080 : {
6081 : Create: s' = extract_field <v_out2, offset>
6082 : Create: s = op <s, s'> // For non SLP cases
6083 : } */
6084 :
6085 240 : if (dump_enabled_p ())
6086 151 : dump_printf_loc (MSG_NOTE, vect_location,
6087 : "Reduce using scalar code.\n");
6088 :
6089 240 : tree compute_type = TREE_TYPE (vectype1);
6090 240 : unsigned element_bitsize = vector_element_bits (vectype1);
6091 240 : unsigned vec_size_in_bits = element_bitsize
6092 240 : * TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6093 240 : tree bitsize = bitsize_int (element_bitsize);
6094 240 : gimple_seq stmts = NULL;
6095 633 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6096 : {
6097 393 : unsigned bit_offset;
6098 786 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6099 393 : vec_temp, bitsize, bitsize_zero_node);
6100 :
6101 : /* In SLP we don't need to apply reduction operation, so we just
6102 : collect s' values in SCALAR_RESULTS. */
6103 393 : if (slp_reduc)
6104 383 : scalar_results.safe_push (new_temp);
6105 :
6106 955 : for (bit_offset = element_bitsize;
6107 1348 : bit_offset < vec_size_in_bits;
6108 955 : bit_offset += element_bitsize)
6109 : {
6110 955 : tree bitpos = bitsize_int (bit_offset);
6111 955 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6112 : compute_type, vec_temp,
6113 : bitsize, bitpos);
6114 955 : if (slp_reduc)
6115 : {
6116 : /* In SLP we don't need to apply reduction operation, so
6117 : we just collect s' values in SCALAR_RESULTS. */
6118 945 : new_temp = new_name;
6119 945 : scalar_results.safe_push (new_name);
6120 : }
6121 : else
6122 10 : new_temp = gimple_build (&stmts, code, compute_type,
6123 : new_name, new_temp);
6124 : }
6125 : }
6126 :
6127 : /* The only case where we need to reduce scalar results in a SLP
6128 : reduction, is unrolling. If the size of SCALAR_RESULTS is
6129 : greater than GROUP_SIZE, we reduce them combining elements modulo
6130 : GROUP_SIZE. */
6131 240 : if (slp_reduc)
6132 : {
6133 230 : tree res, first_res, new_res;
6134 :
6135 : /* Reduce multiple scalar results in case of SLP unrolling. */
6136 878 : for (j = group_size; scalar_results.iterate (j, &res);
6137 : j++)
6138 : {
6139 648 : first_res = scalar_results[j % group_size];
6140 648 : new_res = gimple_build (&stmts, code, compute_type,
6141 : first_res, res);
6142 648 : scalar_results[j % group_size] = new_res;
6143 : }
6144 230 : scalar_results.truncate (group_size);
6145 1140 : for (k = 0; k < group_size; k++)
6146 1360 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6147 680 : scalar_results[k]);
6148 : }
6149 : else
6150 : {
6151 : /* Reduction chain - we have one scalar to keep in
6152 : SCALAR_RESULTS. */
6153 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6154 10 : scalar_results.safe_push (new_temp);
6155 : }
6156 :
6157 240 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6158 : }
6159 :
6160 1969 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6161 0 : && induc_val)
6162 : {
6163 : /* Earlier we set the initial value to be a vector if induc_val
6164 : values. Check the result and if it is induc_val then replace
6165 : with the original initial value, unless induc_val is
6166 : the same as initial_def already. */
6167 0 : tree zcompare = make_ssa_name (boolean_type_node);
6168 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6169 0 : scalar_results[0], induc_val);
6170 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6171 0 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
6172 0 : tree tmp = make_ssa_name (new_scalar_dest);
6173 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6174 0 : initial_def, scalar_results[0]);
6175 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6176 0 : scalar_results[0] = tmp;
6177 : }
6178 : }
6179 :
6180 : /* 2.5 Adjust the final result by the initial value of the reduction
6181 : variable. (When such adjustment is not needed, then
6182 : 'adjustment_def' is zero). For example, if code is PLUS we create:
6183 : new_temp = loop_exit_def + adjustment_def */
6184 :
6185 22146 : if (adjustment_def)
6186 : {
6187 15738 : gcc_assert (!slp_reduc || group_size == 1);
6188 15738 : gimple_seq stmts = NULL;
6189 15738 : if (double_reduc)
6190 : {
6191 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6192 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6193 0 : new_temp = gimple_build (&stmts, code, vectype,
6194 0 : reduc_inputs[0], adjustment_def);
6195 : }
6196 : else
6197 : {
6198 15738 : new_temp = scalar_results[0];
6199 15738 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6200 15738 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6201 : adjustment_def);
6202 15738 : new_temp = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6203 : new_temp);
6204 15738 : new_temp = gimple_build (&stmts, code, TREE_TYPE (compute_vectype),
6205 : new_temp, adjustment_def);
6206 15738 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6207 : }
6208 :
6209 15738 : epilog_stmt = gimple_seq_last_stmt (stmts);
6210 15738 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6211 15738 : scalar_results[0] = new_temp;
6212 : }
6213 :
6214 : /* Record this operation if it could be reused by the epilogue loop. */
6215 22146 : if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
6216 22146 : && reduc_inputs.length () == 1)
6217 21942 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6218 : { orig_reduc_input, reduc_info });
6219 :
6220 : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6221 : phis with new adjusted scalar results, i.e., replace use <s_out0>
6222 : with use <s_out4>.
6223 :
6224 : Transform:
6225 : loop_exit:
6226 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6227 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6228 : v_out2 = reduce <v_out1>
6229 : s_out3 = extract_field <v_out2, 0>
6230 : s_out4 = adjust_result <s_out3>
6231 : use <s_out0>
6232 : use <s_out0>
6233 :
6234 : into:
6235 :
6236 : loop_exit:
6237 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6238 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6239 : v_out2 = reduce <v_out1>
6240 : s_out3 = extract_field <v_out2, 0>
6241 : s_out4 = adjust_result <s_out3>
6242 : use <s_out4>
6243 : use <s_out4> */
6244 :
6245 44292 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6246 22146 : auto_vec<gimple *> phis;
6247 44742 : for (k = 0; k < live_out_stmts.size (); k++)
6248 : {
6249 22596 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6250 22596 : tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6251 :
6252 : /* Find the loop-closed-use at the loop exit of the original scalar
6253 : result. (The reduction result is expected to have two immediate uses,
6254 : one at the latch block, and one at the loop exit). Note with
6255 : early break we can have two exit blocks, so pick the correct PHI. */
6256 114467 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6257 69275 : if (!is_gimple_debug (USE_STMT (use_p))
6258 69275 : && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6259 : {
6260 22591 : gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
6261 22591 : if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6262 22583 : phis.safe_push (USE_STMT (use_p));
6263 22596 : }
6264 :
6265 45179 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6266 : {
6267 : /* Replace the uses: */
6268 22583 : orig_name = PHI_RESULT (exit_phi);
6269 :
6270 : /* Look for a single use at the target of the skip edge. */
6271 22583 : if (unify_with_main_loop_p)
6272 : {
6273 38 : use_operand_p use_p;
6274 38 : gimple *user;
6275 38 : if (!single_imm_use (orig_name, &use_p, &user))
6276 0 : gcc_unreachable ();
6277 38 : orig_name = gimple_get_lhs (user);
6278 : }
6279 :
6280 22583 : scalar_result = scalar_results[k];
6281 83717 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6282 : {
6283 38551 : gphi *use_phi = dyn_cast <gphi *> (use_stmt);
6284 115697 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6285 : {
6286 38573 : if (use_phi
6287 38573 : && (phi_arg_edge_from_use (use_p)->flags & EDGE_ABNORMAL))
6288 : {
6289 0 : gcc_assert (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (orig_name));
6290 0 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (scalar_result) = 1;
6291 : }
6292 38573 : SET_USE (use_p, scalar_result);
6293 : }
6294 38551 : update_stmt (use_stmt);
6295 22583 : }
6296 : }
6297 :
6298 22596 : phis.truncate (0);
6299 : }
6300 22146 : }
6301 :
6302 : /* Return a temporary SSA name of type VECTYPE that holds the result of the
6303 : vector select "MASK ? VEC : IDENTITY", built as one VEC_COND_EXPR
6304 : assignment. That single select statement is inserted before GSI. */
6305 :
6306 : static tree
6307 9 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6308 : tree vec, tree identity)
6309 : {
6310 9 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6311 9 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6312 : mask, vec, identity);
6313 9 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6314 9 : return cond;
6315 : }
6316 :
6317 : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6318 : order, starting with LHS. Insert the element extractions (BIT_FIELD_REFs)
6319 : and the scalar operations before GSI, naming new SSA names after SCALAR_DEST.
6320 : If MASK is non-null, first select the neutral value of CODE for the lanes
6321 : where MASK is false and then reduce all lanes. Return the final SSA name. */
6322 :
6323 : static tree
6324 1161 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6325 : tree_code code, tree lhs, tree vector_rhs,
6326 : tree mask)
6327 : {
6328 1161 : tree vectype = TREE_TYPE (vector_rhs);
6329 1161 : tree scalar_type = TREE_TYPE (vectype);
6330 1161 : tree bitsize = TYPE_SIZE (scalar_type);
6331 1161 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6332 1161 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6333 :
6334 : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6335 : to perform an unconditional element-wise reduction of it. */
6336 1161 : if (mask)
6337 : {
6338 84 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6339 : "masked_vector_rhs");
6340 84 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6341 : false);
6342 84 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6343 84 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6344 : mask, vector_rhs, vector_identity);
6345 84 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6346 84 : vector_rhs = masked_vector_rhs;
6347 : }
6348 :
6349 1161 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6350 5307 : bit_offset < vec_size_in_bits;
6351 4146 : bit_offset += element_bitsize)
6352 : {
6353 4146 : tree bitpos = bitsize_int (bit_offset);
6354 4146 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6355 : bitsize, bitpos);
6356 :
6357 4146 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6358 4146 : rhs = make_ssa_name (scalar_dest, stmt);
6359 4146 : gimple_assign_set_lhs (stmt, rhs);
6360 4146 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6361 : /* Fold the vector extract, combining it with a previous reversal
6362 : as seen in PR90579. */
6363 4146 : auto gsi2 = gsi_for_stmt (stmt);
6364 4146 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6365 358 : update_stmt (gsi_stmt (gsi2));
6366 :
6367 4146 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6368 4146 : tree new_name = make_ssa_name (scalar_dest, stmt);
6369 4146 : gimple_assign_set_lhs (stmt, new_name);
6370 4146 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6371 4146 : lhs = new_name;
6372 : }
6373 1161 : return lhs;
6374 : }
6375 :
6376 : /* Return the masked, or failing that the mask-and-length, variant of REDUC_FN
6377 : that is directly supported for input type VECTYPE_IN, else IFN_LAST. */
6378 :
6379 : static internal_fn
6380 2989 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6381 : {
6382 2989 : internal_fn mask_reduc_fn;
6383 2989 : internal_fn mask_len_reduc_fn;
6384 :
6385 2989 : switch (reduc_fn)
6386 : {
6387 0 : case IFN_FOLD_LEFT_PLUS:
6388 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6389 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6390 0 : break;
6391 :
6392 : default: /* No masked variant is listed for any other function. */
6393 : return IFN_LAST;
6394 : }
6395 :
6396 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6397 : OPTIMIZE_FOR_SPEED))
6398 : return mask_reduc_fn;
6399 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6400 : OPTIMIZE_FOR_SPEED))
6401 : return mask_len_reduc_fn;
6402 : return IFN_LAST;
6403 : }
6404 :
6405 : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6406 : statement that sets the live-out value and SLP_NODE is the SLP node that
6407 : receives the vector defs. CODE is the operation performed (a tree code or
6408 : a conditional internal function) and NUM_OPS its operand count (2 or 4).
6409 : REDUC_INDEX is the index of the SLP child whose first scalar stmt is the
6410 : reduction PHI. REDUC_FN is the function that implements in-order reduction,
6411 : or IFN_LAST if we should open-code it. VECTYPE_IN is the type of the vector
6412 : input. MASKS and LENS provide loop masks and lengths; insert before GSI. */
6413 :
6414 : static bool
6415 895 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6416 : stmt_vec_info stmt_info,
6417 : gimple_stmt_iterator *gsi,
6418 : slp_tree slp_node,
6419 : code_helper code, internal_fn reduc_fn,
6420 : int num_ops, tree vectype_in,
6421 : int reduc_index, vec_loop_masks *masks,
6422 : vec_loop_lens *lens)
6423 : {
6424 895 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6425 895 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
6426 895 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6427 :
6428 895 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6429 :
6430 895 : bool is_cond_op = false;
6431 895 : if (!code.is_tree_code ())
6432 : {
6433 30 : code = conditional_internal_fn_code (internal_fn (code));
6434 30 : gcc_assert (code != ERROR_MARK);
6435 : is_cond_op = true;
6436 : }
6437 :
6438 895 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
6439 :
6440 895 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6441 : TYPE_VECTOR_SUBPARTS (vectype_in)));
6442 :
6443 : /* ??? We should, when transforming the cycle PHI, record the existing
6444 : scalar def as vector def so looking up the vector def works. This
6445 : would also allow generalizing this for reduction paths of length > 1
6446 : and/or SLP reductions. */
6447 895 : slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
6448 895 : stmt_vec_info reduc_var_def = SLP_TREE_SCALAR_STMTS (reduc_node)[0];
6449 895 : tree reduc_var = gimple_get_lhs (STMT_VINFO_STMT (reduc_var_def));
6450 :
6451 : /* The operands either come from a binary operation or an IFN_COND operation.
6452 : The former is a gimple assign with binary rhs and the latter is a
6453 : gimple call with four arguments. */
6454 895 : gcc_assert (num_ops == 2 || num_ops == 4);
6455 :
6456 895 : auto_vec<tree> vec_oprnds0, vec_opmask;
6457 895 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
6458 895 : + (1 - reduc_index)],
6459 : &vec_oprnds0);
6460 : /* For an IFN_COND_OP we also need the vector mask operand. */
6461 895 : if (is_cond_op)
6462 30 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
6463 :
6464 : /* The transform below relies on preserving the original scalar PHI
6465 : and its latch def which we replace. So work backwards from there. */
6466 895 : tree scalar_dest
6467 895 : = gimple_phi_arg_def_from_edge (as_a <gphi *> (STMT_VINFO_STMT
6468 : (reduc_var_def)),
6469 895 : loop_latch_edge (loop));
6470 895 : stmt_vec_info scalar_dest_def_info
6471 895 : = vect_stmt_to_vectorize (loop_vinfo->lookup_def (scalar_dest));
6472 895 : tree scalar_type = TREE_TYPE (scalar_dest);
6473 :
6474 895 : int vec_num = vec_oprnds0.length ();
6475 895 : tree vec_elem_type = TREE_TYPE (vectype_out);
6476 895 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6477 :
6478 895 : tree vector_identity = NULL_TREE;
6479 895 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6480 : {
6481 2 : vector_identity = build_zero_cst (vectype_out);
6482 2 : if (!HONOR_SIGNED_ZEROS (vectype_out))
6483 : ; /* Keep the all-zeros vector built above. */
6484 : else
6485 : {
6486 2 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6487 2 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6488 : vector_identity);
6489 : }
6490 : }
6491 :
6492 895 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6493 895 : int i;
6494 895 : tree def0;
6495 2056 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6496 : {
6497 1161 : gimple *new_stmt;
6498 1161 : tree mask = NULL_TREE;
6499 1161 : tree len = NULL_TREE;
6500 1161 : tree bias = NULL_TREE;
6501 1161 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6502 : {
6503 9 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
6504 : vec_num, vectype_in, i);
6505 9 : if (is_cond_op)
6506 9 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
6507 9 : loop_mask, vec_opmask[i], gsi);
6508 : else
6509 : mask = loop_mask;
6510 : }
6511 1152 : else if (is_cond_op)
6512 75 : mask = vec_opmask[i];
6513 1161 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6514 : {
6515 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6516 : i, 1, false);
6517 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6518 0 : bias = build_int_cst (intQI_type_node, biasval);
6519 0 : if (!is_cond_op)
6520 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
6521 : }
6522 :
6523 : /* Handle MINUS by adding the negative. */
6524 1161 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6525 : {
6526 0 : tree negated = make_ssa_name (vectype_out);
6527 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6528 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6529 0 : def0 = negated;
6530 : }
6531 :
6532 9 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
6533 1170 : && mask && mask_reduc_fn == IFN_LAST)
6534 9 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6535 : vector_identity);
6536 :
6537 : /* On the first iteration the input is simply the scalar phi
6538 : result, and for subsequent iterations it is the output of
6539 : the preceding operation. */
6540 1161 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6541 : {
6542 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6543 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6544 : def0, mask, len, bias);
6545 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6546 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6547 : def0, mask);
6548 : else
6549 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6550 : def0);
6551 : /* For chained SLP reductions the output of the previous reduction
6552 : operation serves as the input of the next. For the final statement
6553 : the output cannot be a temporary - we reuse the original
6554 : scalar destination of the last statement. */
6555 0 : if (i != vec_num - 1)
6556 : {
6557 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
6558 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6559 0 : gimple_set_lhs (new_stmt, reduc_var);
6560 : }
6561 : }
6562 : else
6563 : {
6564 1161 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
6565 : tree_code (code), reduc_var, def0,
6566 : mask);
6567 1161 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6568 : /* Remove the statement, so that we can use the same code paths
6569 : as for statements that we've just created. */
6570 1161 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6571 1161 : gsi_remove (&tmp_gsi, true);
6572 : }
6573 :
6574 1161 : if (i == vec_num - 1)
6575 : {
6576 895 : gimple_set_lhs (new_stmt, scalar_dest);
6577 895 : vect_finish_replace_stmt (loop_vinfo,
6578 : scalar_dest_def_info,
6579 : new_stmt);
6580 : }
6581 : else
6582 266 : vect_finish_stmt_generation (loop_vinfo,
6583 : scalar_dest_def_info,
6584 : new_stmt, gsi);
6585 :
6586 1161 : slp_node->push_vec_def (new_stmt);
6587 : }
6588 :
6589 895 : return true;
6590 895 : }
6591 :
6592 : /* Function is_nonwrapping_integer_induction.
6593 :
6594 : Return true if STMT_VINFO (a PHI in loop LOOP) has INTEGER_CST base and
6595 : step and the value it reaches cannot wrap in the type of its result. */
6596 :
6597 : static bool
6598 408 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6599 : {
6600 408 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6601 408 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6602 408 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6603 408 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6604 408 : widest_int ni, max_loop_value, lhs_max;
6605 408 : wi::overflow_type overflow = wi::OVF_NONE;
6606 :
6607 : /* Both the base and the step must be integer constants. */
6608 408 : if (TREE_CODE (base) != INTEGER_CST
6609 109 : || TREE_CODE (step) != INTEGER_CST)
6610 : return false;
6611 :
6612 : /* Check that BASE + STEP * max-executions will not wrap. */
6613 :
6614 109 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6615 : return true;
6616 :
6617 8 : if (! max_stmt_executions (loop, &ni))
6618 : return false;
6619 :
6620 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6621 8 : &overflow);
6622 8 : if (overflow)
6623 : return false;
6624 :
6625 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6626 16 : TYPE_SIGN (lhs_type), &overflow);
6627 8 : if (overflow)
6628 : return false;
6629 :
6630 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6631 8 : <= TYPE_PRECISION (lhs_type));
6632 408 : }
6633 :
6634 : /* Return true if masking of CODE should be done by inserting a conditional
6635 : expression: COND_FN (which may be IFN_LAST) is not directly supported for
6636 : input type VECTYPE_IN and CODE is DOT_PROD_EXPR or SAD_EXPR. */
6637 : static bool
6638 5962 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6639 : tree vectype_in)
6640 : {
6641 5962 : if (cond_fn != IFN_LAST
6642 5962 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
6643 : OPTIMIZE_FOR_SPEED))
6644 : return false;
6645 :
6646 4221 : if (code.is_tree_code ())
6647 4207 : switch (tree_code (code))
6648 : {
6649 : case DOT_PROD_EXPR:
6650 : case SAD_EXPR:
6651 : return true;
6652 :
6653 : default:
6654 : break;
6655 : }
6656 : return false;
6657 : }
6658 :
6659 : /* Insert a conditional expression to enable masked vectorization. CODE is the
6660 : code for the operation (DOT_PROD_EXPR or SAD_EXPR). VOP is the array of
6661 : operands; VOP[1] is replaced by "MASK ? VOP[1] : zero" for DOT_PROD_EXPR and
6662 : "MASK ? VOP[1] : VOP[0]" for SAD_EXPR. The select is inserted before GSI. */
6663 : static void
6664 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6665 : gimple_stmt_iterator *gsi)
6666 : {
6667 4 : switch (tree_code (code))
6668 : {
6669 4 : case DOT_PROD_EXPR:
6670 4 : {
6671 4 : tree vectype = TREE_TYPE (vop[1]);
6672 4 : tree zero = build_zero_cst (vectype);
6673 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6674 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6675 : mask, vop[1], zero);
6676 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6677 4 : vop[1] = masked_op1;
6678 4 : break;
6679 : }
6680 :
6681 0 : case SAD_EXPR:
6682 0 : {
6683 0 : tree vectype = TREE_TYPE (vop[1]);
6684 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6685 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6686 : mask, vop[1], vop[0]);
6687 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6688 0 : vop[1] = masked_op1;
6689 0 : break;
6690 : }
6691 :
6692 0 : default:
6693 0 : gcc_unreachable ();
6694 : }
6695 4 : }
6696 :
6697 : /* Given an operation with CODE in a loop reduction path whose reduction PHI is
6698 : specified by REDUC_INFO, where the operation has scalar result type TYPE and
6699 : its input vectype is VECTYPE_IN. The vectype of the vectorized result
6700 : may differ from VECTYPE_IN, either in base type or in number of lanes, as
6701 : is the case for lane-reducing operations. Check whether, and how, partial
6702 : vectors can be used for the operation in LOOP_VINFO: either clear
6703 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P or record a loop mask/length for SLP_NODE. */
6704 :
6705 : static void
6706 4093 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
6707 : vect_reduc_info reduc_info,
6708 : slp_tree slp_node,
6709 : code_helper code, tree type,
6710 : tree vectype_in)
6711 : {
6712 4093 : enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
6713 4093 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
6714 4093 : internal_fn cond_fn
6715 1157 : = ((code.is_internal_fn ()
6716 1157 : && internal_fn_mask_index ((internal_fn)code) != -1)
6717 4093 : ? (internal_fn)code : get_conditional_internal_fn (code, type));
6718 :
6719 4093 : if (reduc_type != FOLD_LEFT_REDUCTION
6720 3326 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6721 7306 : && (cond_fn == IFN_LAST
6722 3213 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6723 : OPTIMIZE_FOR_SPEED)))
6724 : {
6725 1999 : if (dump_enabled_p ())
6726 98 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6727 : "can't operate on partial vectors because"
6728 : " no conditional operation is available.\n");
6729 1999 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6730 : }
6731 2094 : else if (reduc_type == FOLD_LEFT_REDUCTION
6732 2094 : && reduc_fn == IFN_LAST
6733 2094 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
6734 : {
6735 0 : if (dump_enabled_p ())
6736 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6737 : "can't operate on partial vectors because"
6738 : " no conditional operation is available.\n");
6739 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6740 : }
6741 2094 : else if (reduc_type == FOLD_LEFT_REDUCTION
6742 767 : && internal_fn_mask_index (reduc_fn) == -1
6743 767 : && FLOAT_TYPE_P (vectype_in)
6744 2861 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
6745 : {
6746 0 : if (dump_enabled_p ())
6747 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6748 : "can't operate on partial vectors because"
6749 : " signed zeros cannot be preserved.\n");
6750 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6751 : }
6752 : else
6753 : {
6754 2094 : internal_fn mask_reduc_fn
6755 2094 : = get_masked_reduction_fn (reduc_fn, vectype_in);
6756 2094 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6757 2094 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
6758 2094 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);
6759 :
6760 2094 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6761 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
6762 : else
6763 2094 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
6764 : }
6765 4093 : }
6766 :
6767 : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
6768 : the context of LOOP_VINFO. SLP_NODE is the SLP node being analyzed. On
6769 : success, record the vector cost in COST_VEC and return true.
6770 :
6771 : For a lane-reducing operation, the loop reduction path that it lies in
6772 : may contain normal operations, or other lane-reducing operations with
6773 : different input type sizes, for example:
6774 :
6775 : int sum = 0;
6776 : for (i)
6777 : {
6778 : ...
6779 : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
6780 : sum += w[i]; // widen-sum <vector(16) char>
6781 : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
6782 : sum += n[i]; // normal <vector(4) int>
6783 : ...
6784 : }
6785 :
6786 : The vectorization factor is essentially determined by the operation whose
6787 : input vectype has the most lanes ("vector(16) char" in the example), while we
6788 : need to choose the input vectype with the fewest lanes ("vector(4) int" in the
6789 : example) to determine the effective number of vector reduction PHIs. */
6790 :
6791 : bool
6792 382206 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
6793 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
6794 : {
6795 382206 : gimple *stmt = stmt_info->stmt;
6796 :
6797 382206 : if (!lane_reducing_stmt_p (stmt))
6798 : return false;
6799 :
6800 716 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
6801 :
6802 716 : if (!INTEGRAL_TYPE_P (type))
6803 : return false;
6804 :
6805 : /* Do not try to vectorize bit-precision reductions. */
6806 716 : if (!type_has_mode_precision_p (type))
6807 : return false;
6808 :
6809 716 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6810 :
6811 : /* TODO: Support lane-reducing operation that does not directly participate
6812 : in loop reduction. */
6813 716 : if (!reduc_info)
6814 : return false;
6815 :
6816 : /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
6817 : recognized. */
6818 716 : gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
6819 716 : gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);
6820 :
6821 2864 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
6822 : {
6823 2148 : slp_tree slp_op;
6824 2148 : tree op;
6825 2148 : tree vectype;
6826 2148 : enum vect_def_type dt;
6827 :
6828 2148 : if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
6829 : &slp_op, &dt, &vectype))
6830 : {
6831 0 : if (dump_enabled_p ())
6832 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6833 : "use not simple.\n");
6834 0 : return false;
6835 : }
6836 :
6837 2148 : if (!vectype)
6838 : {
6839 6 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
6840 : slp_op);
6841 6 : if (!vectype)
6842 : return false;
6843 : }
6844 :
6845 2148 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
6846 : {
6847 0 : if (dump_enabled_p ())
6848 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6849 : "incompatible vector types for invariants\n");
6850 0 : return false;
6851 : }
6852 :
6853 2148 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6854 716 : continue;
6855 :
6856 : /* There should be at most one cycle def in the stmt. */
6857 1432 : if (VECTORIZABLE_CYCLE_DEF (dt))
6858 : return false;
6859 : }
6860 :
6861 716 : slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
6862 716 : tree vectype_in = SLP_TREE_VECTYPE (node_in);
6863 716 : gcc_assert (vectype_in);
6864 :
6865 : /* Compute number of effective vector statements for costing. */
6866 716 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
6867 716 : gcc_assert (ncopies_for_cost >= 1);
6868 :
6869 716 : if (vect_is_emulated_mixed_dot_prod (slp_node))
6870 : {
6871 : /* We need two extra invariants: one that contains the minimum signed
6872 : value and one that contains half of its negative. */
6873 15 : int prologue_stmts = 2;
6874 15 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
6875 : scalar_to_vec, slp_node, 0,
6876 : vect_prologue);
6877 15 : if (dump_enabled_p ())
6878 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
6879 : "extra prologue_cost = %d .\n", cost);
6880 :
6881 : /* Three dot-products and a subtraction. */
6882 15 : ncopies_for_cost *= 4;
6883 : }
6884 :
6885 716 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
6886 : 0, vect_body);
6887 :
6888 716 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
6889 : {
6890 113 : enum tree_code code = gimple_assign_rhs_code (stmt);
6891 113 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
6892 113 : node_in, code, type,
6893 : vectype_in);
6894 : }
6895 :
6896 : /* Transform via vect_transform_reduction. */
6897 716 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6898 716 : return true;
6899 : }
6900 :
6901 : /* Function vectorizable_reduction.
6902 :
6903 : Check if STMT_INFO performs a reduction operation that can be vectorized.
6904 : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6905 : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6906 : Return true if STMT_INFO is vectorizable in this way.
6907 :
6908 : This function also handles reduction idioms (patterns) that have been
6909 : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6910 : may be of this form:
6911 : X = pattern_expr (arg0, arg1, ..., X)
6912 : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6913 : sequence that had been detected and replaced by the pattern-stmt
6914 : (STMT_INFO).
6915 :
6916 : This function also handles reduction of condition expressions, for example:
6917 : for (int i = 0; i < N; i++)
6918 : if (a[i] < value)
6919 : last = a[i];
6920 : This is handled by vectorising the loop and creating an additional vector
6921 : containing the loop indexes for which "a[i] < value" was true. In the
6922 : function epilogue this is reduced to a single max value and then used to
6923 : index into the vector of results.
6924 :
6925 : In some cases of reduction patterns, the type of the reduction variable X is
6926 : different than the type of the other arguments of STMT_INFO.
6927 : In such cases, the vectype that is used when transforming STMT_INFO into
6928 : a vector stmt is different than the vectype that is used to determine the
6929 : vectorization factor, because it consists of a different number of elements
6930 : than the actual number of elements that are being operated upon in parallel.
6931 :
6932 : For example, consider an accumulation of shorts into an int accumulator.
6933 : On some targets it's possible to vectorize this pattern operating on 8
6934 : shorts at a time (hence, the vectype for purposes of determining the
6935 : vectorization factor should be V8HI); on the other hand, the vectype that
6936 : is used to create the vector form is actually V4SI (the type of the result).
6937 :
6938 : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6939 : indicates what is the actual level of parallelism (V8HI in the example), so
6940 : that the right vectorization factor would be derived. This vectype
6941 : corresponds to the type of arguments to the reduction stmt, and should *NOT*
6942 : be used to create the vectorized stmt. The right vectype for the vectorized
6943 : stmt is obtained from the type of the result X:
6944 : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6945 :
6946 : This means that, contrary to "regular" reductions (or "regular" stmts in
6947 : general), the following equation:
6948 : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6949 : does *NOT* necessarily hold for reduction patterns. */
6950 :
6951 : bool
6952 381490 : vectorizable_reduction (loop_vec_info loop_vinfo,
6953 : stmt_vec_info stmt_info, slp_tree slp_node,
6954 : slp_instance slp_node_instance,
6955 : stmt_vector_for_cost *cost_vec)
6956 : {
6957 381490 : tree vectype_in = NULL_TREE;
6958 381490 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6959 381490 : stmt_vec_info cond_stmt_vinfo = NULL;
6960 381490 : int i;
6961 381490 : int ncopies;
6962 381490 : bool single_defuse_cycle = false;
6963 381490 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6964 381490 : tree cond_reduc_val = NULL_TREE;
6965 :
6966 : /* Make sure it was already recognized as a reduction computation. */
6967 381490 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6968 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6969 381490 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6970 : return false;
6971 :
6972 : /* The reduction meta. */
6973 84485 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6974 :
6975 84485 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6976 : {
6977 1490 : gcc_assert (is_a <gphi *> (stmt_info->stmt));
6978 : /* We eventually need to set a vector type on invariant arguments. */
6979 : unsigned j;
6980 : slp_tree child;
6981 4462 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6982 2980 : if (!vect_maybe_update_slp_op_vectype (child,
6983 : SLP_TREE_VECTYPE (slp_node)))
6984 : {
6985 0 : if (dump_enabled_p ())
6986 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6987 : "incompatible vector types for "
6988 : "invariants\n");
6989 0 : return false;
6990 : }
6991 2980 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
6992 2980 : && !useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
6993 : SLP_TREE_VECTYPE (child)))
6994 : {
6995 : /* With bools we can have mask and non-mask precision vectors
6996 : or different non-mask precisions. while pattern recog is
6997 : supposed to guarantee consistency here, we do not have
6998 : pattern stmts for PHIs (PR123316).
6999 : Deal with that here instead of ICEing later. */
7000 8 : if (dump_enabled_p ())
7001 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7002 : "incompatible vector type setup from "
7003 : "bool pattern detection\n");
7004 8 : return false;
7005 : }
7006 : /* Analysis for double-reduction is done on the outer
7007 : loop PHI, nested cycles have no further restrictions. */
7008 1482 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7009 1482 : return true;
7010 : }
7011 :
7012 82995 : if (!is_a <gphi *> (stmt_info->stmt))
7013 : {
7014 7924 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
7015 7924 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
7016 7924 : return true;
7017 : }
7018 :
7019 75071 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7020 75071 : stmt_vec_info phi_info = stmt_info;
7021 75071 : bool double_reduc = false;
7022 75071 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7023 : {
7024 : /* We arrive here for both the inner loop LC PHI and the
7025 : outer loop PHI. The latter is what we want to analyze the
7026 : reduction with. The LC PHI is handled by vectorizable_lc_phi. */
7027 322 : if (gimple_bb (stmt_info->stmt) != loop->header)
7028 0 : return false;
7029 :
7030 : /* Set loop and phi_info to the inner loop. */
7031 322 : use_operand_p use_p;
7032 322 : gimple *use_stmt;
7033 322 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7034 : &use_p, &use_stmt);
7035 322 : gcc_assert (res);
7036 322 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
7037 322 : loop = loop->inner;
7038 322 : double_reduc = true;
7039 : }
7040 :
7041 75071 : const bool reduc_chain = reduc_info->is_reduc_chain;
7042 75071 : slp_node_instance->reduc_phis = slp_node;
7043 : /* ??? We're leaving slp_node to point to the PHIs, we only
7044 : need it to get at the number of vector stmts which wasn't
7045 : yet initialized for the instance root. */
7046 :
7047 : /* PHIs should not participate in patterns. */
7048 75071 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7049 75071 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7050 :
7051 : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7052 : and compute the reduction chain length. Discover the real
7053 : reduction operation stmt on the way (slp_for_stmt_info). */
7054 75071 : unsigned reduc_chain_length = 0;
7055 75071 : stmt_info = NULL;
7056 75071 : slp_tree slp_for_stmt_info = NULL;
7057 75071 : slp_tree vdef_slp = slp_node_instance->root;
7058 165701 : while (vdef_slp != slp_node)
7059 : {
7060 91722 : int reduc_idx = SLP_TREE_REDUC_IDX (vdef_slp);
7061 91722 : if (reduc_idx == -1)
7062 : {
7063 1084 : if (dump_enabled_p ())
7064 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7065 : "reduction chain broken by patterns.\n");
7066 1092 : return false;
7067 : }
7068 90638 : stmt_vec_info vdef = SLP_TREE_REPRESENTATIVE (vdef_slp);
7069 90638 : if (is_a <gphi *> (vdef->stmt))
7070 : {
7071 644 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7072 : /* Do not count PHIs towards the chain length. */
7073 644 : continue;
7074 : }
7075 89994 : gimple_match_op op;
7076 89994 : if (!gimple_extract_op (vdef->stmt, &op))
7077 : {
7078 0 : if (dump_enabled_p ())
7079 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7080 : "reduction chain includes unsupported"
7081 : " statement type.\n");
7082 0 : return false;
7083 : }
7084 89994 : if (CONVERT_EXPR_CODE_P (op.code))
7085 : {
7086 5238 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7087 : {
7088 8 : if (dump_enabled_p ())
7089 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7090 : "conversion in the reduction chain.\n");
7091 8 : return false;
7092 : }
7093 5230 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
7094 : }
7095 : else
7096 : {
7097 : /* First non-conversion stmt. */
7098 84756 : if (!slp_for_stmt_info)
7099 73979 : slp_for_stmt_info = vdef_slp;
7100 :
7101 84756 : if (lane_reducing_op_p (op.code))
7102 : {
7103 : /* The last operand of lane-reducing operation is for
7104 : reduction. */
7105 716 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7106 :
7107 716 : slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
7108 716 : tree vectype_op = SLP_TREE_VECTYPE (op_node);
7109 716 : tree type_op = TREE_TYPE (op.ops[0]);
7110 716 : if (!vectype_op)
7111 : {
7112 9 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7113 : type_op);
7114 9 : if (!vectype_op
7115 9 : || !vect_maybe_update_slp_op_vectype (op_node,
7116 : vectype_op))
7117 0 : return false;
7118 : }
7119 :
7120 : /* To accommodate lane-reducing operations of mixed input
7121 : vectypes, choose input vectype with the least lanes for the
7122 : reduction PHI statement, which would result in the most
7123 : ncopies for vectorized reduction results. */
7124 716 : if (!vectype_in
7125 716 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7126 58 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7127 687 : vectype_in = vectype_op;
7128 : }
7129 84040 : else if (!vectype_in)
7130 73292 : vectype_in = SLP_TREE_VECTYPE (slp_node);
7131 84756 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7132 : }
7133 89986 : reduc_chain_length++;
7134 : }
7135 73979 : if (!slp_for_stmt_info)
7136 : {
7137 0 : if (dump_enabled_p ())
7138 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7139 : "only noop-conversions in the reduction chain.\n");
7140 0 : return false;
7141 : }
7142 73979 : stmt_info = SLP_TREE_REPRESENTATIVE (slp_for_stmt_info);
7143 :
7144 : /* PHIs should not participate in patterns. */
7145 73979 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7146 :
7147 : /* 1. Is vectorizable reduction? */
7148 : /* Not supportable if the reduction variable is used in the loop, unless
7149 : it's a reduction chain. */
7150 73979 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7151 0 : && !reduc_chain)
7152 : return false;
7153 :
7154 : /* Reductions that are not used even in an enclosing outer-loop,
7155 : are expected to be "live" (used out of the loop). */
7156 73979 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7157 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7158 : return false;
7159 :
7160 : /* 2. Has this been recognized as a reduction pattern?
7161 :
7162 : Check if STMT represents a pattern that has been recognized
7163 : in earlier analysis stages. For stmts that represent a pattern,
7164 : the STMT_VINFO_RELATED_STMT field records the last stmt in
7165 : the original sequence that constitutes the pattern. */
7166 :
7167 73979 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7168 73979 : if (orig_stmt_info)
7169 : {
7170 5106 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7171 5106 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7172 : }
7173 :
7174 : /* 3. Check the operands of the operation. The first operands are defined
7175 : inside the loop body. The last operand is the reduction variable,
7176 : which is defined by the loop-header-phi. */
7177 :
7178 73979 : tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
7179 73979 : VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
7180 :
7181 73979 : gimple_match_op op;
7182 73979 : if (!gimple_extract_op (stmt_info->stmt, &op))
7183 0 : gcc_unreachable ();
7184 73979 : bool lane_reducing = lane_reducing_op_p (op.code);
7185 :
7186 73979 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7187 22131 : && !SCALAR_FLOAT_TYPE_P (op.type))
7188 : return false;
7189 :
7190 : /* Do not try to vectorize bit-precision reductions. */
7191 73979 : if (!type_has_mode_precision_p (op.type)
7192 1737 : && op.code != BIT_AND_EXPR
7193 1602 : && op.code != BIT_IOR_EXPR
7194 74455 : && op.code != BIT_XOR_EXPR)
7195 : return false;
7196 :
7197 : /* Lane-reducing ops also never can be used in a SLP reduction group
7198 : since we'll mix lanes belonging to different reductions. But it's
7199 : OK to use them in a reduction chain or when the reduction group
7200 : has just one element. */
7201 73669 : if (lane_reducing
7202 73669 : && !reduc_chain
7203 650 : && SLP_TREE_LANES (slp_node) > 1)
7204 : {
7205 0 : if (dump_enabled_p ())
7206 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7207 : "lane-reducing reduction in reduction group.\n");
7208 0 : return false;
7209 : }
7210 :
7211 : /* All uses but the last are expected to be defined in the loop.
7212 : The last use is the reduction variable. In case of nested cycle this
7213 : assumption is not true: we use reduc_index to record the index of the
7214 : reduction variable. */
7215 73669 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7216 73669 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7217 73669 : gcc_assert (op.code != COND_EXPR || !COMPARISON_CLASS_P (op.ops[0]));
7218 236091 : for (i = 0; i < (int) op.num_ops; i++)
7219 : {
7220 : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7221 162422 : if (i == 0 && op.code == COND_EXPR)
7222 81292 : continue;
7223 :
7224 161568 : stmt_vec_info def_stmt_info;
7225 161568 : enum vect_def_type dt;
7226 161568 : if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
7227 : i, &op.ops[i], &slp_op[i], &dt,
7228 161568 : &vectype_op[i], &def_stmt_info))
7229 : {
7230 0 : if (dump_enabled_p ())
7231 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7232 : "use not simple.\n");
7233 0 : return false;
7234 : }
7235 :
7236 : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7237 : reduction operand twice (once as definition, once as else). */
7238 161568 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]
7239 323136 : == SLP_TREE_CHILDREN
7240 161568 : (slp_for_stmt_info)[SLP_TREE_REDUC_IDX (slp_for_stmt_info)])
7241 80438 : continue;
7242 :
7243 : /* There should be only one cycle def in the stmt, the one
7244 : leading to reduc_def. */
7245 81130 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]->cycle_info.id != -1)
7246 : return false;
7247 :
7248 81130 : if (!vectype_op[i])
7249 7399 : vectype_op[i]
7250 7399 : = get_vectype_for_scalar_type (loop_vinfo,
7251 7399 : TREE_TYPE (op.ops[i]), slp_op[i]);
7252 :
7253 : /* Record how the non-reduction-def value of COND_EXPR is defined.
7254 : ??? For a chain of multiple CONDs we'd have to match them up all. */
7255 81130 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7256 : {
7257 831 : if (dt == vect_constant_def)
7258 : {
7259 118 : cond_reduc_dt = dt;
7260 118 : cond_reduc_val = op.ops[i];
7261 : }
7262 713 : else if (dt == vect_induction_def
7263 408 : && def_stmt_info
7264 1121 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7265 : {
7266 109 : cond_reduc_dt = dt;
7267 109 : cond_stmt_vinfo = def_stmt_info;
7268 : }
7269 : }
7270 : }
7271 :
7272 73669 : enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7273 : /* If we have a condition reduction, see if we can simplify it further. */
7274 73669 : if (reduction_type == COND_REDUCTION)
7275 : {
7276 842 : if (SLP_TREE_LANES (slp_node) != 1)
7277 : return false;
7278 :
7279 : /* When the condition uses the reduction value in the condition, fail. */
7280 818 : if (SLP_TREE_REDUC_IDX (slp_node) == 0)
7281 : {
7282 0 : if (dump_enabled_p ())
7283 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7284 : "condition depends on previous iteration\n");
7285 0 : return false;
7286 : }
7287 :
7288 818 : if (reduc_chain_length == 1
7289 818 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7290 : OPTIMIZE_FOR_SPEED)
7291 795 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7292 : vectype_in,
7293 : OPTIMIZE_FOR_SPEED)))
7294 : {
7295 0 : if (dump_enabled_p ())
7296 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7297 : "optimizing condition reduction with"
7298 : " FOLD_EXTRACT_LAST.\n");
7299 0 : VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7300 : }
7301 818 : else if (cond_reduc_dt == vect_induction_def)
7302 : {
7303 109 : tree base
7304 : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7305 109 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7306 :
7307 109 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7308 : && TREE_CODE (step) == INTEGER_CST);
7309 109 : cond_reduc_val = NULL_TREE;
7310 109 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7311 109 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7312 109 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7313 : ;
7314 : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7315 : above base; punt if base is the minimum value of the type for
7316 : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7317 97 : else if (tree_int_cst_sgn (step) == -1)
7318 : {
7319 18 : cond_reduc_op_code = MIN_EXPR;
7320 18 : if (tree_int_cst_sgn (base) == -1)
7321 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7322 18 : else if (tree_int_cst_lt (base,
7323 18 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7324 18 : cond_reduc_val
7325 18 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7326 : }
7327 : else
7328 : {
7329 79 : cond_reduc_op_code = MAX_EXPR;
7330 79 : if (tree_int_cst_sgn (base) == 1)
7331 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7332 79 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7333 : base))
7334 79 : cond_reduc_val
7335 79 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7336 : }
7337 97 : if (cond_reduc_val)
7338 : {
7339 97 : if (dump_enabled_p ())
7340 61 : dump_printf_loc (MSG_NOTE, vect_location,
7341 : "condition expression based on "
7342 : "integer induction.\n");
7343 97 : VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
7344 97 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
7345 97 : = cond_reduc_val;
7346 97 : VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7347 : }
7348 : }
7349 709 : else if (cond_reduc_dt == vect_constant_def)
7350 : {
7351 108 : enum vect_def_type cond_initial_dt;
7352 108 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7353 108 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7354 108 : if (cond_initial_dt == vect_constant_def
7355 133 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7356 25 : TREE_TYPE (cond_reduc_val)))
7357 : {
7358 25 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7359 : cond_initial_val, cond_reduc_val);
7360 25 : if (e && (integer_onep (e) || integer_zerop (e)))
7361 : {
7362 25 : if (dump_enabled_p ())
7363 16 : dump_printf_loc (MSG_NOTE, vect_location,
7364 : "condition expression based on "
7365 : "compile time constant.\n");
7366 : /* Record reduction code at analysis stage. */
7367 25 : VECT_REDUC_INFO_CODE (reduc_info)
7368 25 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7369 25 : VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
7370 : }
7371 : }
7372 : }
7373 : }
7374 :
7375 73645 : if (STMT_VINFO_LIVE_P (phi_info))
7376 : return false;
7377 :
7378 73645 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
7379 :
7380 73645 : gcc_assert (ncopies >= 1);
7381 :
7382 73645 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7383 :
7384 : /* 4.2. Check support for the epilog operation.
7385 :
7386 : If STMT represents a reduction pattern, then the type of the
7387 : reduction variable may be different than the type of the rest
7388 : of the arguments. For example, consider the case of accumulation
7389 : of shorts into an int accumulator; The original code:
7390 : S1: int_a = (int) short_a;
7391 : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7392 :
7393 : was replaced with:
7394 : STMT: int_acc = widen_sum <short_a, int_acc>
7395 :
7396 : This means that:
7397 : 1. The tree-code that is used to create the vector operation in the
7398 : epilog code (that reduces the partial results) is not the
7399 : tree-code of STMT, but is rather the tree-code of the original
7400 : stmt from the pattern that STMT is replacing. I.e, in the example
7401 : above we want to use 'widen_sum' in the loop, but 'plus' in the
7402 : epilog.
7403 : 2. The type (mode) we use to check available target support
7404 : for the vector operation to be created in the *epilog*, is
7405 : determined by the type of the reduction variable (in the example
7406 : above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7407 : However the type (mode) we use to check available target support
7408 : for the vector operation to be created *inside the loop*, is
7409 : determined by the type of the other arguments to STMT (in the
7410 : example we'd check this: optab_handler (widen_sum_optab,
7411 : vect_short_mode)).
7412 :
7413 : This is contrary to "regular" reductions, in which the types of all
7414 : the arguments are the same as the type of the reduction variable.
7415 : For "regular" reductions we can therefore use the same vector type
7416 : (and also the same tree-code) when generating the epilog code and
7417 : when generating the code inside the loop. */
7418 :
7419 73645 : code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
7420 :
7421 : /* If-conversion might have created a conditional operation like
7422 : IFN_COND_ADD already. Use the underlying tree code for the following checks. */
7423 73645 : if (orig_code.is_internal_fn ())
7424 : {
7425 6837 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7426 6837 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7427 : }
7428 :
7429 73645 : VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
7430 :
7431 73645 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7432 73645 : if (reduction_type == TREE_CODE_REDUCTION)
7433 : {
7434 : /* Check whether it's ok to change the order of the computation.
7435 : Generally, when vectorizing a reduction we change the order of the
7436 : computation. This may change the behavior of the program in some
7437 : cases, so we need to check that this is ok. One exception is when
7438 : vectorizing an outer-loop: the inner-loop is executed sequentially,
7439 : and therefore vectorizing reductions in the inner-loop during
7440 : outer-loop vectorization is safe. Likewise when we are vectorizing
7441 : a series of reductions using SLP and the VF is one the reductions
7442 : are performed in scalar order. */
7443 72827 : if (!reduc_chain
7444 72827 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7445 : ;
7446 72670 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7447 : {
7448 : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7449 : is not directly used in stmt. */
7450 5174 : if (reduc_chain_length != 1)
7451 : {
7452 73 : if (dump_enabled_p ())
7453 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7454 : "in-order reduction chain without SLP.\n");
7455 73 : return false;
7456 : }
7457 : /* Code generation doesn't support function calls other
7458 : than .COND_*. */
7459 5101 : if (!op.code.is_tree_code ()
7460 5309 : && !(op.code.is_internal_fn ()
7461 104 : && conditional_internal_fn_code (internal_fn (op.code))
7462 : != ERROR_MARK))
7463 : {
7464 18 : if (dump_enabled_p ())
7465 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7466 : "in-order reduction chain operation not "
7467 : "supported.\n");
7468 18 : return false;
7469 : }
7470 5083 : VECT_REDUC_INFO_TYPE (reduc_info)
7471 5083 : = reduction_type = FOLD_LEFT_REDUCTION;
7472 : }
7473 67496 : else if (!commutative_binary_op_p (orig_code, op.type)
7474 67496 : || !associative_binary_op_p (orig_code, op.type))
7475 : {
7476 144 : if (dump_enabled_p ())
7477 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7478 : "reduction: not commutative/associative\n");
7479 144 : return false;
7480 : }
7481 : }
7482 :
7483 5083 : if ((reduction_type == COND_REDUCTION
7484 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7485 : || reduction_type == CONST_COND_REDUCTION
7486 68327 : || reduction_type == EXTRACT_LAST_REDUCTION)
7487 818 : && ncopies > 1)
7488 : {
7489 276 : if (dump_enabled_p ())
7490 60 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7491 : "multiple types in condition reduction.\n");
7492 276 : return false;
7493 : }
7494 :
7495 : /* See if we can convert a mask vector to a corresponding bool data vector
7496 : to perform the epilogue reduction. */
7497 73134 : tree alt_vectype_out = NULL_TREE;
7498 73134 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
7499 : {
7500 1141 : alt_vectype_out
7501 2282 : = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
7502 1141 : TREE_TYPE (vectype_out),
7503 : TYPE_VECTOR_SUBPARTS
7504 : (vectype_out));
7505 1141 : if (!alt_vectype_out
7506 1141 : || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
7507 2255 : TYPE_VECTOR_SUBPARTS (vectype_out))
7508 2282 : || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
7509 27 : alt_vectype_out = NULL_TREE;
7510 : }
7511 :
7512 73134 : internal_fn reduc_fn = IFN_LAST;
7513 73134 : if (reduction_type == TREE_CODE_REDUCTION
7514 73134 : || reduction_type == FOLD_LEFT_REDUCTION
7515 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7516 542 : || reduction_type == CONST_COND_REDUCTION)
7517 : {
7518 67623 : if (reduction_type == FOLD_LEFT_REDUCTION
7519 77003 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7520 67623 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7521 : {
7522 72034 : internal_fn sbool_fn = IFN_LAST;
7523 72034 : if (reduc_fn == IFN_LAST)
7524 : ;
7525 69996 : else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
7526 1141 : || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
7527 : == MODE_VECTOR_BOOL))
7528 138851 : && direct_internal_fn_supported_p (reduc_fn, vectype_out,
7529 : OPTIMIZE_FOR_SPEED))
7530 : ;
7531 18386 : else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
7532 1141 : && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
7533 19527 : && direct_internal_fn_supported_p (sbool_fn, vectype_out,
7534 : OPTIMIZE_FOR_SPEED))
7535 125 : reduc_fn = sbool_fn;
7536 18261 : else if (reduction_type != FOLD_LEFT_REDUCTION
7537 18261 : && alt_vectype_out
7538 18261 : && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
7539 : OPTIMIZE_FOR_SPEED))
7540 795 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7541 : else
7542 : {
7543 17466 : if (dump_enabled_p ())
7544 942 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7545 : "reduc op not supported by target.\n");
7546 :
7547 17466 : reduc_fn = IFN_LAST;
7548 : }
7549 : }
7550 : else
7551 : {
7552 672 : if (dump_enabled_p ())
7553 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7554 : "no reduc code for scalar code.\n");
7555 :
7556 672 : return false;
7557 : }
7558 72034 : if (reduc_fn == IFN_LAST
7559 72034 : && VECTOR_BOOLEAN_TYPE_P (vectype_out))
7560 : {
7561 221 : if (!alt_vectype_out)
7562 : {
7563 12 : if (dump_enabled_p ())
7564 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7565 : "cannot turn mask into bool data vector for "
7566 : "reduction epilogue.\n");
7567 12 : return false;
7568 : }
7569 209 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7570 : }
7571 : }
7572 428 : else if (reduction_type == COND_REDUCTION)
7573 : {
7574 428 : int scalar_precision
7575 428 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7576 428 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
7577 428 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7578 : vectype_out);
7579 :
7580 428 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7581 : OPTIMIZE_FOR_SPEED))
7582 22 : reduc_fn = IFN_REDUC_MAX;
7583 : }
7584 72450 : VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
7585 :
7586 72450 : if (reduction_type != EXTRACT_LAST_REDUCTION
7587 : && reduc_fn == IFN_LAST
7588 : && !nunits_out.is_constant ())
7589 : {
7590 : if (dump_enabled_p ())
7591 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7592 : "missing target support for reduction on"
7593 : " variable-length vectors.\n");
7594 : return false;
7595 : }
7596 :
7597 : /* For SLP reductions, see if there is a neutral value we can use. */
7598 72450 : tree neutral_op = NULL_TREE;
7599 72450 : tree initial_value = NULL_TREE;
7600 72450 : if (reduc_chain)
7601 2240 : initial_value = vect_phi_initial_value (reduc_def_phi);
7602 72450 : neutral_op = neutral_op_for_reduction (TREE_TYPE
7603 : (gimple_phi_result (reduc_def_phi)),
7604 : orig_code, initial_value);
7605 72450 : VECT_REDUC_INFO_NEUTRAL_OP (reduc_info) = neutral_op;
7606 :
7607 72450 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7608 : {
7609 : /* We can't support in-order reductions of code such as this:
7610 :
7611 : for (int i = 0; i < n1; ++i)
7612 : for (int j = 0; j < n2; ++j)
7613 : l += a[j];
7614 :
7615 : since GCC effectively transforms the loop when vectorizing:
7616 :
7617 : for (int i = 0; i < n1 / VF; ++i)
7618 : for (int j = 0; j < n2; ++j)
7619 : for (int k = 0; k < VF; ++k)
7620 : l += a[j];
7621 :
7622 : which is a reassociation of the original operation. */
7623 66 : if (dump_enabled_p ())
7624 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7625 : "in-order double reduction not supported.\n");
7626 :
7627 66 : return false;
7628 : }
7629 :
7630 72384 : if (reduction_type == FOLD_LEFT_REDUCTION
7631 4345 : && SLP_TREE_LANES (slp_node) > 1
7632 159 : && !reduc_chain)
7633 : {
7634 : /* We cannot use in-order reductions in this case because there is
7635 : an implicit reassociation of the operations involved. */
7636 64 : if (dump_enabled_p ())
7637 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7638 : "in-order unchained SLP reductions not supported.\n");
7639 64 : return false;
7640 : }
7641 :
7642 : /* For double reductions, and for SLP reductions with a neutral value,
7643 : we construct a variable-length initial vector by loading a vector
7644 : full of the neutral value and then shift-and-inserting the start
7645 : values into the low-numbered elements. This is however not needed
7646 : when neutral and initial value are equal or we can handle the
7647 : initial value via adjustment in the epilogue. */
7648 72320 : if ((double_reduc || neutral_op)
7649 : && !nunits_out.is_constant ()
7650 : && reduction_type != INTEGER_INDUC_COND_REDUCTION
7651 : && !((SLP_TREE_LANES (slp_node) == 1 || reduc_chain)
7652 : && neutral_op
7653 : && (!double_reduc
7654 : || operand_equal_p (neutral_op,
7655 : vect_phi_initial_value (reduc_def_phi))))
7656 : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7657 : vectype_out, OPTIMIZE_FOR_BOTH))
7658 : {
7659 : if (dump_enabled_p ())
7660 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7661 : "reduction on variable-length vectors requires"
7662 : " target support for a vector-shift-and-insert"
7663 : " operation.\n");
7664 : return false;
7665 : }
7666 :
7667 : /* Check extra constraints for variable-length unchained SLP reductions. */
7668 72320 : if (!reduc_chain
7669 : && !nunits_out.is_constant ())
7670 : {
7671 : /* We checked above that we could build the initial vector when
7672 : there's a neutral element value. Check here for the case in
7673 : which each SLP statement has its own initial value and in which
7674 : that value needs to be repeated for every instance of the
7675 : statement within the initial vector. */
7676 : unsigned int group_size = SLP_TREE_LANES (slp_node);
7677 : if (!neutral_op
7678 : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7679 : TREE_TYPE (vectype_out)))
7680 : {
7681 : if (dump_enabled_p ())
7682 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7683 : "unsupported form of SLP reduction for"
7684 : " variable-length vectors: cannot build"
7685 : " initial vector.\n");
7686 : return false;
7687 : }
7688 : /* The epilogue code relies on the number of elements being a multiple
7689 : of the group size. The duplicate-and-interleave approach to setting
7690 : up the initial vector does too. */
7691 : if (!multiple_p (nunits_out, group_size))
7692 : {
7693 : if (dump_enabled_p ())
7694 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7695 : "unsupported form of SLP reduction for"
7696 : " variable-length vectors: the vector size"
7697 : " is not a multiple of the number of results.\n");
7698 : return false;
7699 : }
7700 : }
7701 :
7702 72320 : if (reduction_type == COND_REDUCTION)
7703 : {
7704 428 : widest_int ni;
7705 :
7706 428 : if (! max_loop_iterations (loop, &ni))
7707 : {
7708 14 : if (dump_enabled_p ())
7709 0 : dump_printf_loc (MSG_NOTE, vect_location,
7710 : "loop count not known, cannot create cond "
7711 : "reduction.\n");
7712 14 : return false;
7713 : }
7714 : /* Convert backedges to iterations. */
7715 414 : ni += 1;
7716 :
7717 : /* The additional index will be the same type as the condition. Check
7718 : that the loop can fit into this less one (because we'll use up the
7719 : zero slot for when there are no matches). */
7720 414 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7721 414 : if (wi::geu_p (ni, wi::to_widest (max_index)))
7722 : {
7723 90 : if (dump_enabled_p ())
7724 54 : dump_printf_loc (MSG_NOTE, vect_location,
7725 : "loop size is greater than data size.\n");
7726 90 : return false;
7727 : }
7728 428 : }
7729 :
7730 : /* In case the vectorization factor (VF) is bigger than the number
7731 : of elements that we can fit in a vectype (nunits), we have to generate
7732 : more than one vector stmt - i.e - we need to "unroll" the
7733 : vector stmt by a factor VF/nunits. For more details see documentation
7734 : in vectorizable_operation. */
7735 :
7736 : /* If the reduction is used in an outer loop we need to generate
7737 : VF intermediate results, like so (e.g. for ncopies=2):
7738 : r0 = phi (init, r0)
7739 : r1 = phi (init, r1)
7740 : r0 = x0 + r0;
7741 : r1 = x1 + r1;
7742 : (i.e. we generate VF results in 2 registers).
7743 : In this case we have a separate def-use cycle for each copy, and therefore
7744 : for each copy we get the vector def for the reduction variable from the
7745 : respective phi node created for this copy.
7746 :
7747 : Otherwise (the reduction is unused in the loop nest), we can combine
7748 : together intermediate results, like so (e.g. for ncopies=2):
7749 : r = phi (init, r)
7750 : r = x0 + r;
7751 : r = x1 + r;
7752 : (i.e. we generate VF/2 results in a single register).
7753 : In this case for each copy we get the vector def for the reduction variable
7754 : from the vectorized reduction operation generated in the previous iteration.
7755 :
7756 : This only works when we see both the reduction PHI and its only consumer
7757 : in vectorizable_reduction and there are no intermediate stmts
7758 : participating. When unrolling we want each unrolled iteration to have its
7759 : own reduction accumulator since one of the main goals of unrolling a
7760 : reduction is to reduce the aggregate loop-carried latency. */
7761 72216 : if (ncopies > 1
7762 72216 : && !reduc_chain
7763 8027 : && SLP_TREE_LANES (slp_node) == 1
7764 7859 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7765 7836 : && reduc_chain_length == 1
7766 7422 : && loop_vinfo->suggested_unroll_factor == 1)
7767 72216 : single_defuse_cycle = true;
7768 :
7769 72216 : if (single_defuse_cycle && !lane_reducing)
7770 : {
7771 6474 : gcc_assert (op.code != COND_EXPR);
7772 :
7773 : /* 4. check support for the operation in the loop
7774 :
7775 : This isn't necessary for the lane reduction codes, since they
7776 : can only be produced by pattern matching, and it's up to the
7777 : pattern matcher to test for support. The main reason for
7778 : specifically skipping this step is to avoid rechecking whether
7779 : mixed-sign dot-products can be implemented using signed
7780 : dot-products. */
7781 6474 : machine_mode vec_mode = TYPE_MODE (vectype_in);
7782 6474 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
7783 : {
7784 2065 : if (dump_enabled_p ())
7785 44 : dump_printf (MSG_NOTE, "op not supported by target.\n");
7786 4130 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7787 2065 : || !vect_can_vectorize_without_simd_p (op.code))
7788 : single_defuse_cycle = false;
7789 : else
7790 5 : if (dump_enabled_p ())
7791 0 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7792 : }
7793 :
7794 6474 : if (vect_emulated_vector_p (vectype_in)
7795 6474 : && !vect_can_vectorize_without_simd_p (op.code))
7796 : {
7797 0 : if (dump_enabled_p ())
7798 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
7799 0 : return false;
7800 : }
7801 : }
7802 72216 : if (dump_enabled_p () && single_defuse_cycle)
7803 701 : dump_printf_loc (MSG_NOTE, vect_location,
7804 : "using single def-use cycle for reduction by reducing "
7805 : "multiple vectors to one in the loop body\n");
7806 72216 : VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7807 :
7808 : /* For lane-reducing operation, the below processing related to single
7809 : defuse-cycle will be done in its own vectorizable function. One more
7810 : thing to note is that the operation must not be involved in fold-left
7811 : reduction. */
7812 72216 : single_defuse_cycle &= !lane_reducing;
7813 :
7814 72216 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
7815 28272 : for (i = 0; i < (int) op.num_ops; i++)
7816 19646 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7817 : {
7818 0 : if (dump_enabled_p ())
7819 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7820 : "incompatible vector types for invariants\n");
7821 0 : return false;
7822 : }
7823 :
7824 72216 : vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
7825 : reduction_type, ncopies, cost_vec);
7826 : /* Cost the reduction op inside the loop if transformed via
7827 : vect_transform_reduction for non-lane-reducing operation. Otherwise
7828 : this is costed by the separate vectorizable_* routines. */
7829 72216 : if (single_defuse_cycle)
7830 4414 : record_stmt_cost (cost_vec, ncopies, vector_stmt,
7831 : slp_for_stmt_info, 0, vect_body);
7832 :
7833 72216 : if (dump_enabled_p ()
7834 72216 : && reduction_type == FOLD_LEFT_REDUCTION)
7835 264 : dump_printf_loc (MSG_NOTE, vect_location,
7836 : "using an in-order (fold-left) reduction.\n");
7837 72216 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7838 :
7839 : /* All but single defuse-cycle optimized and fold-left reductions go
7840 : through their own vectorizable_* routines. */
7841 72216 : stmt_vec_info tem
7842 72216 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (slp_node_instance));
7843 72216 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
7844 63590 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7845 : else
7846 : {
7847 8626 : STMT_VINFO_DEF_TYPE (tem) = vect_reduction_def;
7848 8626 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7849 3980 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7850 : slp_node, op.code, op.type,
7851 : vectype_in);
7852 : }
7853 : return true;
7854 : }
7855 :
7856 : /* STMT_INFO is a dot-product reduction whose multiplication operands
7857 : have different signs. Emit a sequence to emulate the operation
7858 : using a series of signed DOT_PROD_EXPRs and return the last
7859 : statement generated. VEC_DEST is the result of the vector operation
7860 : and VOP lists its inputs. */
7861 :
7862 : static gassign *
7863 4 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7864 : gimple_stmt_iterator *gsi, tree vec_dest,
7865 : tree vop[3])
7866 : {
7867 4 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7868 4 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7869 4 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
7870 4 : gimple *new_stmt;
7871 :
7872 : /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
7873 4 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7874 0 : std::swap (vop[0], vop[1]);
7875 :
7876 : /* Convert VOP[1] and VOP[2] to signed types; VOP[0] stays unsigned until the subtraction below. */
7877 12 : for (int i = 1; i < 3; ++i)
7878 8 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7879 : {
7880 0 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7881 0 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7882 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7883 0 : vop[i] = tmp;
7884 : }
7885 :
7886 : /* In the comments below we assume 8-bit inputs for simplicity,
7887 : but the approach works for any full integer type. */
7888 :
7889 : /* Create a vector of -128. */
7890 4 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7891 4 : tree min_narrow = build_vector_from_val (TREE_TYPE (vop[0]),
7892 4 : fold_convert
7893 : (TREE_TYPE (TREE_TYPE (vop[0])),
7894 : min_narrow_elttype));
7895 :
7896 : /* Create a vector of 64. */
7897 4 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7898 4 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7899 4 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7900 :
7901 : /* Emit: SUB_RES = VOP[0] - 128 in an unsigned type. */
7902 4 : tree sub_res = make_ssa_name (TREE_TYPE (vop[0]));
7903 4 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7904 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7905 :
7906 4 : vop[0] = make_ssa_name (narrow_vectype);
7907 4 : new_stmt = gimple_build_assign (vop[0], VIEW_CONVERT_EXPR,
7908 : build1 (VIEW_CONVERT_EXPR, narrow_vectype,
7909 : sub_res));
7910 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7911 :
7912 : /* Emit:
7913 :
7914 : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7915 : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7916 : STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7917 :
7918 : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7919 : Doing the two 64 * y steps first allows more time to compute x. */
7920 4 : tree stage1 = make_ssa_name (wide_vectype);
7921 4 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7922 : vop[1], half_narrow, vop[2]);
7923 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7924 :
7925 4 : tree stage2 = make_ssa_name (wide_vectype);
7926 4 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7927 : vop[1], half_narrow, stage1);
7928 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7929 :
7930 4 : tree stage3 = make_ssa_name (wide_vectype);
7931 4 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7932 : vop[0], vop[1], stage2);
7933 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7934 :
7935 : /* Convert STAGE3 to the reduction type. */
7936 4 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7937 4 : }
7938 :
7939 : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7940 : value. The vector statements are emitted at GSI and their results are recorded as vector defs of SLP_NODE. Fold-left reductions are delegated to vectorize_fold_left_reduction, whose result is returned; otherwise the function returns true. */
7941 :
7942 : bool
7943 2636 : vect_transform_reduction (loop_vec_info loop_vinfo,
7944 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7945 : slp_tree slp_node)
7946 : {
7947 2636 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
7948 2636 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7949 2636 : unsigned vec_num;
7950 :
7951 2636 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
7952 :
7953 2636 : if (nested_in_vect_loop_p (loop, stmt_info))
7954 : {
7955 0 : loop = loop->inner;
7956 0 : gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
7957 : == vect_double_reduction_def);
7958 : }
7959 :
7960 2636 : gimple_match_op op;
7961 2636 : if (!gimple_extract_op (stmt_info->stmt, &op))
7962 0 : gcc_unreachable ();
7963 :
7964 : /* All uses but the last are expected to be defined in the loop.
7965 : The last use is the reduction variable. In case of nested cycle this
7966 : assumption is not true: we use reduc_index to record the index of the
7967 : reduction variable. */
7968 2636 : int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
7969 2636 : tree vectype_in = SLP_TREE_VECTYPE (slp_node);
7970 2636 : if (lane_reducing_op_p (op.code))
7971 262 : vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);
7972 :
7973 2636 : vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
7974 :
7975 2636 : code_helper code = canonicalize_code (op.code, op.type);
7976 2636 : internal_fn cond_fn
7977 483 : = ((code.is_internal_fn ()
7978 483 : && internal_fn_mask_index ((internal_fn)code) != -1)
7979 2636 : ? (internal_fn)code : get_conditional_internal_fn (code, op.type));
7980 :
7981 2636 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7982 2636 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7983 2636 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7984 :
7985 : /* Transform. */
7986 2636 : tree new_temp = NULL_TREE;
7987 18452 : auto_vec<tree> vec_oprnds[3];
7988 :
7989 2636 : if (dump_enabled_p ())
7990 770 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7991 :
7992 : /* A binary COND_OP reduction must have the same definition and else
7993 : value. */
7994 3119 : bool cond_fn_p = code.is_internal_fn ()
7995 483 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
7996 483 : if (cond_fn_p)
7997 : {
7998 483 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
7999 : || code == IFN_COND_MUL || code == IFN_COND_AND
8000 : || code == IFN_COND_IOR || code == IFN_COND_XOR
8001 : || code == IFN_COND_MIN || code == IFN_COND_MAX);
8002 483 : gcc_assert (op.num_ops == 4
8003 : && (op.ops[reduc_index]
8004 : == op.ops[internal_fn_else_index ((internal_fn) code)]));
8005 : }
8006 :
8007 2636 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8008 :
8009 2636 : vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
8010 2636 : if (reduction_type == FOLD_LEFT_REDUCTION)
8011 : {
8012 895 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
8013 895 : gcc_assert (code.is_tree_code () || cond_fn_p);
8014 895 : return vectorize_fold_left_reduction
8015 895 : (loop_vinfo, stmt_info, gsi, slp_node,
8016 895 : code, reduc_fn, op.num_ops, vectype_in,
8017 895 : reduc_index, masks, lens);
8018 : }
8019 :
8020 1741 : bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
8021 1741 : bool lane_reducing = lane_reducing_op_p (code);
8022 1479 : gcc_assert (single_defuse_cycle || lane_reducing);
8023 :
8024 1741 : if (lane_reducing)
8025 : {
8026 : /* The last operand of lane-reducing op is for reduction. */
8027 262 : gcc_assert (reduc_index == (int) op.num_ops - 1);
8028 : }
8029 :
8030 : /* Create the destination vector. */
8031 1741 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8032 1741 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8033 :
8034 : /* Get NCOPIES vector definitions for all operands except the reduction
8035 : definition. */
8036 1741 : if (!cond_fn_p)
8037 : {
8038 1288 : gcc_assert (reduc_index >= 0 && reduc_index <= 2);
8039 2121 : vect_get_vec_defs (loop_vinfo, slp_node,
8040 1288 : single_defuse_cycle && reduc_index == 0
8041 : ? NULL_TREE : op.ops[0], &vec_oprnds[0],
8042 1288 : single_defuse_cycle && reduc_index == 1
8043 : ? NULL_TREE : op.ops[1], &vec_oprnds[1],
8044 1288 : op.num_ops == 3
8045 262 : && !(single_defuse_cycle && reduc_index == 2)
8046 : ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
8047 : }
8048 : else
8049 : {
8050 : /* For a conditional operation pass the truth type as mask
8051 : vectype. */
8052 453 : gcc_assert (single_defuse_cycle
8053 : && (reduc_index == 1 || reduc_index == 2));
8054 453 : vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
8055 : &vec_oprnds[0],
8056 : reduc_index == 1 ? NULL_TREE : op.ops[1],
8057 : &vec_oprnds[1],
8058 : reduc_index == 2 ? NULL_TREE : op.ops[2],
8059 : &vec_oprnds[2]);
8060 : }
8061 :
8062 : /* For single def-use cycles get one copy of the vectorized reduction
8063 : definition. */
8064 1741 : if (single_defuse_cycle)
8065 : {
8066 1650 : vect_get_vec_defs (loop_vinfo, slp_node,
8067 : reduc_index == 0 ? op.ops[0] : NULL_TREE,
8068 : &vec_oprnds[0],
8069 : reduc_index == 1 ? op.ops[1] : NULL_TREE,
8070 : &vec_oprnds[1],
8071 : reduc_index == 2 ? op.ops[2] : NULL_TREE,
8072 : &vec_oprnds[2]);
8073 : }
8074 91 : else if (lane_reducing)
8075 : {
8076 : /* For a normal reduction, consistency between vectorized def/use is
8077 : naturally ensured when mapping from the scalar statement. But if a lane-
8078 : reducing op is involved in the reduction, things become somewhat
8079 : complicated in that the op's result and operand for accumulation are
8080 : limited to fewer lanes than other operands, which certainly causes
8081 : def/use mismatch on adjacent statements around the op if there is no
8082 : specific adjustment. One approach is to refit the lane-
8083 : reducing op by introducing new trivial pass-through copies
8084 : to fix a possible def/use gap, so as to make it behave like a normal op.
8085 : Vector reduction PHIs are always generated to the full extent, no
8086 : matter whether a lane-reducing op exists or not. If some copies or PHIs are
8087 : actually superfluous, they would be cleaned up by passes after
8088 : vectorization. An example for single-lane slp, with lane-reducing ops
8089 : of mixed input vectypes in a reduction chain, is given below.
8090 : Similarly, this handling is applicable for multiple-lane slp as well.
8091 :
8092 : int sum = 1;
8093 : for (i)
8094 : {
8095 : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
8096 : sum += w[i]; // widen-sum <vector(16) char>
8097 : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
8098 : sum += n[i]; // normal <vector(4) int>
8099 : }
8100 :
8101 : The vector size is 128-bit, vectorization factor is 16. Reduction
8102 : statements would be transformed as:
8103 :
8104 : vector<4> int sum_v0 = { 0, 0, 0, 1 };
8105 : vector<4> int sum_v1 = { 0, 0, 0, 0 };
8106 : vector<4> int sum_v2 = { 0, 0, 0, 0 };
8107 : vector<4> int sum_v3 = { 0, 0, 0, 0 };
8108 :
8109 : for (i / 16)
8110 : {
8111 : sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8112 : sum_v1 = sum_v1; // copy
8113 : sum_v2 = sum_v2; // copy
8114 : sum_v3 = sum_v3; // copy
8115 :
8116 : sum_v0 = sum_v0; // copy
8117 : sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8118 : sum_v2 = sum_v2; // copy
8119 : sum_v3 = sum_v3; // copy
8120 :
8121 : sum_v0 = sum_v0; // copy
8122 : sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8123 : sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8124 : sum_v3 = sum_v3; // copy
8125 :
8126 : sum_v0 += n_v0[i: 0 ~ 3 ];
8127 : sum_v1 += n_v1[i: 4 ~ 7 ];
8128 : sum_v2 += n_v2[i: 8 ~ 11];
8129 : sum_v3 += n_v3[i: 12 ~ 15];
8130 : }
8131 :
8132 : Moreover, for higher instruction parallelism in the final vectorized
8133 : loop, the effective vector lane-reducing
8134 : ops are distributed evenly among all def-use cycles. In the above
8135 : example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
8136 : cycles, so instruction dependency among them could be eliminated. */
8137 91 : unsigned effec_ncopies = vec_oprnds[0].length ();
8138 91 : unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8139 :
8140 91 : gcc_assert (effec_ncopies <= total_ncopies);
8141 :
8142 91 : if (effec_ncopies < total_ncopies)
8143 : {
8144 273 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8145 : {
8146 364 : gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8147 182 : vec_oprnds[i].safe_grow_cleared (total_ncopies);
8148 : }
8149 : }
8150 :
8151 91 : tree reduc_vectype_in = vectype_in;
8152 91 : gcc_assert (reduc_vectype_in);
8153 :
8154 91 : unsigned effec_reduc_ncopies
8155 91 : = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
8156 :
8157 91 : gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8158 :
8159 91 : if (effec_ncopies < effec_reduc_ncopies)
8160 : {
8161 : /* Find suitable def-use cycles to generate vectorized statements
8162 : into, and reorder operands based on the selection. */
8163 0 : unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
8164 0 : unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8165 :
8166 0 : gcc_assert (curr_pos < effec_reduc_ncopies);
8167 0 : VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;
8168 :
8169 0 : if (curr_pos)
8170 : {
8171 0 : unsigned count = effec_reduc_ncopies - effec_ncopies;
8172 0 : unsigned start = curr_pos - count;
8173 :
8174 0 : if ((int) start < 0)
8175 : {
8176 0 : count = curr_pos;
8177 0 : start = 0;
8178 : }
8179 :
8180 0 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8181 : {
8182 0 : for (unsigned j = effec_ncopies; j > start; j--)
8183 : {
8184 0 : unsigned k = j - 1;
8185 0 : std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8186 0 : gcc_assert (!vec_oprnds[i][k]);
8187 : }
8188 : }
8189 : }
8190 : }
8191 : }
8192 :
8193 1741 : bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
8194 3002 : unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8195 1741 : unsigned mask_index = 0;
8196 :
8197 7654 : for (unsigned i = 0; i < num; ++i)
8198 : {
8199 5913 : gimple *new_stmt;
8200 5913 : tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8201 5913 : if (!vop[0] || !vop[1])
8202 : {
8203 479 : tree reduc_vop = vec_oprnds[reduc_index][i];
8204 :
8205 : /* If we could not generate an effective vector statement for the current
8206 : portion of the reduction operand, insert a trivial copy to simply
8207 : hand over the operand to other dependent statements. */
8208 479 : gcc_assert (reduc_vop);
8209 :
8210 479 : if (TREE_CODE (reduc_vop) == SSA_NAME
8211 479 : && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8212 479 : new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8213 : else
8214 : {
8215 0 : new_temp = make_ssa_name (vec_dest);
8216 0 : new_stmt = gimple_build_assign (new_temp, reduc_vop);
8217 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8218 : gsi);
8219 : }
8220 : }
8221 5434 : else if (masked_loop_p && !mask_by_cond_expr)
8222 : {
8223 : /* No conditional ifns have been defined for lane-reducing op
8224 : yet. */
8225 16 : gcc_assert (!lane_reducing);
8226 :
8227 16 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8228 : vec_num, vectype_in,
8229 : mask_index++);
8230 16 : gcall *call;
8231 24 : if (code.is_internal_fn () && cond_fn_p)
8232 : {
8233 16 : gcc_assert (op.num_ops >= 3
8234 : && internal_fn_mask_index (internal_fn (code)) == 0);
8235 8 : vop[2] = vec_oprnds[2][i];
8236 8 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
8237 : mask, vop[0], gsi);
8238 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
8239 : vop[2], vop[reduc_index]);
8240 : }
8241 : else
8242 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
8243 : vop[1], vop[reduc_index]);
8244 16 : new_temp = make_ssa_name (vec_dest, call);
8245 16 : gimple_call_set_lhs (call, new_temp);
8246 16 : gimple_call_set_nothrow (call, true);
8247 16 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8248 16 : new_stmt = call;
8249 : }
8250 : else
8251 : {
8252 5418 : if (op.num_ops >= 3)
8253 1772 : vop[2] = vec_oprnds[2][i];
8254 :
8255 5418 : if (masked_loop_p && mask_by_cond_expr)
8256 : {
8257 4 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8258 : vec_num, vectype_in,
8259 : mask_index++);
8260 4 : build_vect_cond_expr (code, vop, mask, gsi);
8261 : }
8262 :
8263 5418 : if (emulated_mixed_dot_prod)
8264 4 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8265 : vec_dest, vop);
8266 :
8267 6756 : else if (code.is_internal_fn () && !cond_fn_p)
8268 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8269 : op.num_ops,
8270 : vop[0], vop[1], vop[2]);
8271 6756 : else if (code.is_internal_fn () && cond_fn_p)
8272 1342 : new_stmt = gimple_build_call_internal (internal_fn (code),
8273 : op.num_ops,
8274 : vop[0], vop[1], vop[2],
8275 : vop[reduc_index]);
8276 : else
8277 4072 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8278 : vop[0], vop[1], vop[2]);
8279 5418 : new_temp = make_ssa_name (vec_dest, new_stmt);
8280 5418 : gimple_set_lhs (new_stmt, new_temp);
8281 5418 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8282 : }
8283 : /* In a single def-use cycle the result of every copy but the last becomes the reduction operand of the next copy; all other results are recorded as vector defs of SLP_NODE. */
8284 5913 : if (single_defuse_cycle && i < num - 1)
8285 3535 : vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8286 : else
8287 2378 : slp_node->push_vec_def (new_stmt);
8288 : }
8289 :
8290 : return true;
8291 10544 : }
8292 :
8293 : /* Transform phase of a cycle PHI. Creates the vector PHI nodes for STMT_INFO in the header of the (possibly inner) loop, adds their loop-entry arguments and records them as vector defs of SLP_NODE; the loop-latch arguments are filled in later. For EXTRACT_LAST and FOLD_LEFT reductions the scalar PHI is left in place. Always returns true. */
8294 :
8295 : bool
8296 23727 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8297 : stmt_vec_info stmt_info,
8298 : slp_tree slp_node, slp_instance slp_node_instance)
8299 : {
8300 23727 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
8301 23727 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8302 23727 : int i;
8303 23727 : bool nested_cycle = false;
8304 23727 : int vec_num;
8305 :
8306 23865 : if (nested_in_vect_loop_p (loop, stmt_info))
8307 : {
8308 : loop = loop->inner;
8309 : nested_cycle = true;
8310 : }
8311 :
8312 23727 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
8313 23727 : if (reduc_info
8314 23065 : && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8315 23065 : || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
8316 : /* Leave the scalar phi in place. */
8317 : return true;
8318 :
8319 22170 : if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ())
8320 127 : dump_printf_loc (MSG_NOTE, vect_location,
8321 : "vectorizing a reduction chain\n");
8322 :
8323 22832 : vec_num = vect_get_num_copies (loop_vinfo, slp_node);
8324 :
8325 : /* Check whether we should use a single PHI node and accumulate
8326 : vectors to one before the backedge. */
8327 22832 : if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
8328 22832 : vec_num = 1;
8329 :
8330 : /* Create the destination vector. */
8331 22832 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8332 22832 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8333 : vectype_out);
8334 :
8335 : /* Get the loop-entry arguments. */
8336 22832 : auto_vec<tree> vec_initial_defs;
8337 22832 : vec_initial_defs.reserve (vec_num);
8338 : /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8339 : and we can't use zero for induc_val, use initial_def. Similarly
8340 : for REDUC_MIN and initial_def larger than the base. */
8341 22832 : if (reduc_info
8342 22170 : && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8343 : {
8344 62 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8345 62 : tree initial_def = vect_phi_initial_value (phi);
8346 62 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
8347 62 : tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
8348 62 : if (TREE_CODE (initial_def) == INTEGER_CST
8349 60 : && !integer_zerop (induc_val)
8350 122 : && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
8351 42 : && tree_int_cst_lt (initial_def, induc_val))
8352 58 : || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
8353 18 : && tree_int_cst_lt (induc_val, initial_def))))
8354 : {
8355 2 : induc_val = initial_def;
8356 : /* Communicate we used the initial_def to epilogue
8357 : generation. */
8358 2 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8359 : }
8360 62 : vec_initial_defs.quick_push
8361 62 : (build_vector_from_val (vectype_out, induc_val));
8362 62 : }
8363 22770 : else if (nested_cycle)
8364 : {
8365 748 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8366 748 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8367 : &vec_initial_defs);
8368 : }
8369 : else
8370 : {
8371 22022 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8372 22022 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
8373 22022 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8374 :
8375 22022 : unsigned int num_phis = stmts.length ();
8376 22022 : if (reduc_info->is_reduc_chain)
8377 200 : num_phis = 1;
8378 22022 : initial_values.reserve (num_phis);
8379 44489 : for (unsigned int i = 0; i < num_phis; ++i)
8380 : {
8381 22467 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8382 22467 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8383 : }
8384 22022 : tree neutral_op = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
8385 22022 : if (vec_num == 1
8386 22022 : && vect_find_reusable_accumulator (loop_vinfo,
8387 : reduc_info, vectype_out))
8388 : ;
8389 : /* Try to simplify the vector initialization by applying an
8390 : adjustment after the reduction has been performed. This
8391 : can also break a critical path but on the other hand
8392 : requires keeping the initial value live across the loop. */
8393 17911 : else if (neutral_op
8394 17336 : && initial_values.length () == 1
8395 17152 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8396 34986 : && !operand_equal_p (neutral_op, initial_values[0]))
8397 : {
8398 12155 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
8399 12155 : = initial_values[0];
8400 12155 : initial_values[0] = neutral_op;
8401 : }
8402 22022 : if (!VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
8403 4111 : || loop_vinfo->main_loop_edge)
8404 43598 : get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
8405 : &vec_initial_defs, vec_num,
8406 : stmts.length (), neutral_op);
8407 : }
8408 :
8409 22832 : if (reduc_info)
8410 22170 : if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
8411 : {
8412 4111 : tree def = accumulator->reduc_input;
8413 4111 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8414 : {
8415 4108 : unsigned int nreduc;
8416 8216 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8417 4108 : (TREE_TYPE (def)),
8418 4108 : TYPE_VECTOR_SUBPARTS (vectype_out),
8419 : &nreduc);
8420 0 : gcc_assert (res);
8421 4108 : gimple_seq stmts = NULL;
8422 : /* Reduce the single vector to a smaller one. */
8423 4108 : if (nreduc != 1)
8424 : {
8425 : /* Perform the reduction in the appropriate type. */
8426 4108 : tree rvectype = vectype_out;
8427 4108 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8428 4108 : TREE_TYPE (TREE_TYPE (def))))
8429 235 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8430 : TYPE_VECTOR_SUBPARTS
8431 470 : (vectype_out));
8432 4108 : def = vect_create_partial_epilog (def, rvectype,
8433 : VECT_REDUC_INFO_CODE
8434 : (reduc_info),
8435 : &stmts);
8436 : }
8437 : /* The epilogue loop might use a different vector mode, like
8438 : VNx2DI vs. V2DI. */
8439 4108 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8440 : {
8441 0 : tree reduc_type = build_vector_type_for_mode
8442 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8443 0 : def = gimple_convert (&stmts, reduc_type, def);
8444 : }
8445 : /* Adjust the input so we pick up the partially reduced value
8446 : for the skip edge in vect_create_epilog_for_reduction. */
8447 4108 : accumulator->reduc_input = def;
8448 : /* And the reduction could be carried out using a different sign. */
8449 4108 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8450 235 : def = gimple_convert (&stmts, vectype_out, def);
8451 4108 : edge e;
8452 4108 : if ((e = loop_vinfo->main_loop_edge)
8453 4108 : || (e = loop_vinfo->skip_this_loop_edge))
8454 : {
8455 : /* While we'd like to insert on the edge this will split
8456 : blocks and disturb bookkeeping, we also will eventually
8457 : need this on the skip edge. Rely on sinking to
8458 : fixup optimal placement and insert in the pred. */
8459 3885 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
8460 : /* Insert before a cond that eventually skips the
8461 : epilogue. */
8462 3885 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8463 3868 : gsi_prev (&gsi);
8464 3885 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8465 : }
8466 : else
8467 223 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8468 : stmts);
8469 : }
8470 4111 : if (loop_vinfo->main_loop_edge)
8471 3888 : vec_initial_defs[0]
8472 3888 : = vect_get_main_loop_result (loop_vinfo, def,
8473 3888 : vec_initial_defs[0]);
8474 : else
8475 223 : vec_initial_defs.safe_push (def);
8476 : }
8477 :
8478 : /* Generate the reduction PHIs upfront. */
8479 47553 : for (i = 0; i < vec_num; i++)
8480 : {
8481 24721 : tree vec_init_def = vec_initial_defs[i];
8482 : /* Create the reduction-phi that defines the reduction
8483 : operand. */
8484 24721 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
8485 24721 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8486 : UNKNOWN_LOCATION);
8487 :
8488 : /* The loop-latch arg is set in epilogue processing. */
8489 :
8490 24721 : slp_node->push_vec_def (new_phi);
8491 : }
8492 :
8493 22832 : return true;
8494 22832 : }
8495 :
8496 : /* Vectorizes LC PHIs. Analysis phase: returns true and marks SLP_NODE as lc_phi_info_type when STMT_INFO is a single-argument PHI with a supported def type and compatible vector types; returns false otherwise. */
8497 :
8498 : bool
8499 181833 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8500 : stmt_vec_info stmt_info,
8501 : slp_tree slp_node)
8502 : {
8503 181833 : if (!loop_vinfo
8504 181833 : || !is_a <gphi *> (stmt_info->stmt)
8505 217674 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8506 : return false;
8507 :
8508 821 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8509 0 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8510 : return false;
8511 :
8512 : /* Deal with copies from externs or constants that disguise as
8513 : loop-closed PHI nodes (PR97886). */
8514 821 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8515 : SLP_TREE_VECTYPE (slp_node)))
8516 : {
8517 0 : if (dump_enabled_p ())
8518 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8519 : "incompatible vector types for invariants\n");
8520 0 : return false;
8521 : }
8522 :
8523 : /* ??? This can happen with data vs. mask uses of boolean. */
8524 821 : if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
8525 821 : SLP_TREE_VECTYPE
8526 : (SLP_TREE_CHILDREN (slp_node)[0])))
8527 : {
8528 0 : if (dump_enabled_p ())
8529 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8530 : "missed mask promotion\n");
8531 0 : return false;
8532 : }
8533 :
8534 821 : SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
8535 821 : return true;
8536 : }
8537 : /* Transform phase for an LC PHI: for each vector def of the scalar PHI argument, creates a single-argument vector PHI in the block of STMT_INFO and records it as a vector def of SLP_NODE. Always returns true. */
8538 : bool
8539 530 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
8540 : stmt_vec_info stmt_info,
8541 : slp_tree slp_node)
8542 : {
8543 :
8544 530 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8545 530 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8546 530 : basic_block bb = gimple_bb (stmt_info->stmt);
8547 530 : edge e = single_pred_edge (bb);
8548 530 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8549 530 : auto_vec<tree> vec_oprnds;
8550 1060 : vect_get_vec_defs (loop_vinfo, slp_node,
8551 530 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8552 1175 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8553 : {
8554 : /* Create the vectorized LC PHI node. */
8555 645 : gphi *new_phi = create_phi_node (vec_dest, bb);
8556 645 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8557 645 : slp_node->push_vec_def (new_phi);
8558 : }
8559 :
8560 530 : return true;
8561 530 : }
8562 :
8563 : /* Vectorizes PHIs. When COST_VEC is non-null this is the analysis phase, which checks the SLP children, records the cost and marks SLP_NODE as phi_info_type; otherwise the vector PHIs are created and recorded as vector defs of SLP_NODE. Returns false if STMT_INFO is not a PHI that can be handled here. */
8564 :
8565 : bool
8566 138388 : vectorizable_phi (bb_vec_info vinfo,
8567 : stmt_vec_info stmt_info,
8568 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8569 : {
8570 138388 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8571 : return false;
8572 :
8573 72105 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8574 : return false;
8575 :
8576 72105 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8577 :
8578 72105 : if (cost_vec) /* transformation not required. */
8579 : {
8580 : slp_tree child;
8581 : unsigned i;
8582 197654 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8583 139731 : if (!child)
8584 : {
8585 0 : if (dump_enabled_p ())
8586 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8587 : "PHI node with unvectorized backedge def\n");
8588 0 : return false;
8589 : }
8590 139731 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8591 : {
8592 18 : if (dump_enabled_p ())
8593 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8594 : "incompatible vector types for invariants\n");
8595 18 : return false;
8596 : }
8597 139713 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8598 139713 : && !useless_type_conversion_p (vectype,
8599 : SLP_TREE_VECTYPE (child)))
8600 : {
8601 : /* With bools we can have mask and non-mask precision vectors
8602 : or different non-mask precisions. While pattern recog is
8603 : supposed to guarantee consistency here bugs in it can cause
8604 : mismatches (PR103489 and PR103800 for example).
8605 : Deal with them here instead of ICEing later. */
8606 18 : if (dump_enabled_p ())
8607 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8608 : "incompatible vector type setup from "
8609 : "bool pattern detection\n");
8610 18 : return false;
8611 : }
8612 :
8613 : /* For single-argument PHIs assume coalescing which means zero cost
8614 : for the scalar and the vector PHIs. This avoids artificially
8615 : favoring the vector path (but may pessimize it in some cases). */
8616 57923 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8617 52469 : record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
8618 : vector_stmt, slp_node, vectype, 0, vect_body);
8619 57923 : SLP_TREE_TYPE (slp_node) = phi_info_type;
8620 57923 : return true;
8621 : }
8622 :
8623 14146 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8624 14146 : basic_block bb = gimple_bb (stmt_info->stmt);
8625 14146 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8626 14146 : auto_vec<gphi *> new_phis;
8627 51634 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8628 : {
8629 37488 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8630 :
8631 : /* Skip not yet vectorized defs. */
8632 37935 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8633 37488 : && SLP_TREE_VEC_DEFS (child).is_empty ())
8634 447 : continue;
8635 :
8636 37041 : auto_vec<tree> vec_oprnds;
8637 37041 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8638 37041 : if (!new_phis.exists ())
8639 : {
8640 14146 : new_phis.create (vec_oprnds.length ());
8641 29933 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8642 : {
8643 : /* Create the vectorized PHI node. */
8644 15787 : new_phis.quick_push (create_phi_node (vec_dest, bb));
8645 15787 : slp_node->push_vec_def (new_phis[j]);
8646 : }
8647 : }
8648 37041 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8649 80827 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8650 43786 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8651 37041 : }
8652 : /* We should have at least one already vectorized child. */
8653 14146 : gcc_assert (new_phis.exists ());
8654 :
8655 14146 : return true;
8656 14146 : }
8657 :
8658 : /* Vectorizes first order recurrences. An overview of the transformation
8659 : is described below. Suppose we have the following loop.
8660 :
8661 : int t = 0;
8662 : for (int i = 0; i < n; ++i)
8663 : {
8664 : b[i] = a[i] - t;
8665 : t = a[i];
8666 : }
8667 :
8668 : There is a first-order recurrence on 'a'. For this loop, the scalar IR
8669 : looks (simplified) like:
8670 :
8671 : scalar.preheader:
8672 : init = 0;
8673 :
8674 : scalar.body:
8675 : i = PHI <0(scalar.preheader), i+1(scalar.body)>
8676 : _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8677 : _1 = a[i]
8678 : b[i] = _1 - _2
8679 : if (i < n) goto scalar.body
8680 :
8681 : In this example, _2 is a recurrence because its value depends on the
8682 : previous iteration. We vectorize this as (VF = 4)
8683 :
8684 : vector.preheader:
8685 : vect_init = vect_cst(..., ..., ..., 0)
8686 :
8687 : vector.body
8688 : i = PHI <0(vector.preheader), i+4(vector.body)>
8689 : vect_1 = PHI <vect_init(vector.preheader), vect_2(vector.body)>
8690 : vect_2 = a[i, i+1, i+2, i+3];
8691 : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8692 : b[i, i+1, i+2, i+3] = vect_2 - vect_3
8693 : if (..) goto vector.body
8694 :
8695 : In this function, vectorizable_recurr, we code generate both the
8696 : vector PHI node and the permute since those together compute the
8697 : vectorized value of the scalar PHI. We do not yet have the
8698 : backedge value to fill in there nor into the vec_perm. Those
8699 : are filled in vect_schedule_scc.
8700 :
8701 : TODO: Since the scalar loop does not have a use of the recurrence
8702 : outside of the loop the natural way to implement peeling via
8703 : vectorizing the live value doesn't work. For now peeling of loops
8704 : with a recurrence is not implemented. For SLP the supported cases
8705 : are restricted to those requiring a single vector recurrence PHI. */
8706 :
8707 : bool
8708 181057 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8709 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8710 : {
8711 181057 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8712 : return false;
8713 :
8714 35065 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
8715 :
8716 : /* So far we only support first-order recurrence auto-vectorization. */
8717 35065 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8718 : return false;
8719 :
8720 418 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8721 418 : unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
8722 418 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8723 418 : unsigned dist = SLP_TREE_LANES (slp_node);
8724 : /* We need to be able to make progress with a single vector. */
8725 418 : if (maybe_gt (dist * 2, nunits))
8726 : {
8727 0 : if (dump_enabled_p ())
8728 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8729 : "first order recurrence exceeds half of "
8730 : "a vector\n");
8731 0 : return false;
8732 : }
8733 :
8734 : /* We need to be able to build a { ..., a, b } init vector with
8735 : dist number of distinct trailing values. Always possible
8736 : when dist == 1 or when nunits is constant or when the initializations
8737 : are uniform. */
8738 418 : tree uniform_initval = NULL_TREE;
8739 418 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8740 1696 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8741 : {
8742 454 : gphi *phi = as_a <gphi *> (s->stmt);
8743 454 : if (! uniform_initval)
8744 418 : uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8745 36 : else if (! operand_equal_p (uniform_initval,
8746 36 : PHI_ARG_DEF_FROM_EDGE (phi, pe)))
8747 : {
8748 : uniform_initval = NULL_TREE;
8749 : break;
8750 : }
8751 : }
8752 418 : if (!uniform_initval && !nunits.is_constant ())
8753 : {
8754 : if (dump_enabled_p ())
8755 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8756 : "cannot build initialization vector for "
8757 : "first order recurrence\n");
8758 : return false;
8759 : }
8760 :
8761 : /* First-order recurrence autovectorization needs to handle permutation
8762 : with indices = [nunits-1, nunits, nunits+1, ...]. */
8763 418 : vec_perm_builder sel (nunits, 1, 3);
8764 1672 : for (int i = 0; i < 3; ++i)
8765 1254 : sel.quick_push (nunits - dist + i);
8766 418 : vec_perm_indices indices (sel, 2, nunits);
8767 :
8768 418 : if (cost_vec) /* transformation not required. */
8769 : {
8770 373 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8771 : indices))
8772 : return false;
8773 :
8774 : /* We eventually need to set a vector type on invariant
8775 : arguments. */
8776 : unsigned j;
8777 : slp_tree child;
8778 783 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8779 522 : if (!vect_maybe_update_slp_op_vectype (child, vectype))
8780 : {
8781 0 : if (dump_enabled_p ())
8782 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8783 : "incompatible vector types for "
8784 : "invariants\n");
8785 0 : return false;
8786 : }
8787 :
8788 : /* Verify we have set up compatible types. */
8789 261 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8790 261 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
8791 261 : tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
8792 261 : if (!types_compatible_p (latch_vectype, vectype))
8793 : return false;
8794 :
8795 : /* The recurrence costs the initialization vector and one permute
8796 : for each copy. With SLP the prologue value is explicitly
8797 : represented and costed separately. */
8798 261 : unsigned prologue_cost = 0;
8799 261 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8800 : slp_node, 0, vect_body);
8801 261 : if (dump_enabled_p ())
8802 53 : dump_printf_loc (MSG_NOTE, vect_location,
8803 : "vectorizable_recurr: inside_cost = %d, "
8804 : "prologue_cost = %d .\n", inside_cost,
8805 : prologue_cost);
8806 :
8807 261 : SLP_TREE_TYPE (slp_node) = recurr_info_type;
8808 261 : return true;
8809 : }
8810 :
8811 45 : tree vec_init;
8812 45 : if (! uniform_initval)
8813 : {
8814 6 : vec<constructor_elt, va_gc> *v = NULL;
8815 6 : vec_alloc (v, nunits.to_constant ());
8816 33 : for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
8817 27 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8818 : build_zero_cst (TREE_TYPE (vectype)));
8819 39 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8820 : {
8821 21 : gphi *phi = as_a <gphi *> (s->stmt);
8822 21 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8823 21 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
8824 21 : TREE_TYPE (preheader)))
8825 : {
8826 0 : gimple_seq stmts = NULL;
8827 0 : preheader = gimple_convert (&stmts,
8828 0 : TREE_TYPE (vectype), preheader);
8829 0 : gsi_insert_seq_on_edge_immediate (pe, stmts);
8830 : }
8831 21 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
8832 : }
8833 6 : vec_init = build_constructor (vectype, v);
8834 : }
8835 : else
8836 : vec_init = uniform_initval;
8837 45 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8838 :
8839 : /* Create the vectorized first-order PHI node. */
8840 45 : tree vec_dest = vect_get_new_vect_var (vectype,
8841 : vect_simple_var, "vec_recur_");
8842 45 : basic_block bb = gimple_bb (phi);
8843 45 : gphi *new_phi = create_phi_node (vec_dest, bb);
8844 45 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8845 :
8846 : /* Insert shuffles the first-order recurrence autovectorization.
8847 : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8848 45 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
8849 :
8850 : /* Insert the required permute after the latch definition. The
8851 : second and later operands are tentative and will be updated when we have
8852 : vectorized the latch definition. */
8853 45 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8854 45 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8855 45 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8856 53 : do
8857 : {
8858 53 : gsi_next (&gsi2);
8859 : }
8860 : /* Skip inserted vectorized stmts for the latch definition. We have to
8861 : insert after those. */
8862 98 : while (gsi_stmt (gsi2) && gimple_uid (gsi_stmt (gsi2)) == 0);
8863 :
8864 127 : for (unsigned i = 0; i < ncopies; ++i)
8865 : {
8866 82 : vec_dest = make_ssa_name (vectype);
8867 82 : gassign *vperm
8868 127 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8869 45 : i == 0 ? gimple_phi_result (new_phi) : NULL,
8870 : NULL, perm);
8871 82 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8872 :
8873 82 : slp_node->push_vec_def (vperm);
8874 : }
8875 :
8876 : return true;
8877 418 : }
8878 :
8879 : /* Return true if VECTYPE represents a vector that requires lowering
8880 : by the vector lowering pass. That is the case when the mode of VECTYPE
   is not a vector mode, unless VECTYPE is a boolean vector whose
   elements have a precision of 1. */
8881 :
8882 : bool
8883 781584 : vect_emulated_vector_p (tree vectype)
8884 : {
8885 1563168 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8886 785673 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8887 4071 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8888 : }
8889 :
8890 : /* Return true if we can emulate CODE on an integer mode representation
8891 : of a vector. Only addition, subtraction, negation and the bitwise
   AND, IOR, XOR and NOT operations are accepted; every other tree code
   yields false. */
8892 :
8893 : bool
8894 11763 : vect_can_vectorize_without_simd_p (tree_code code)
8895 : {
8896 11763 : switch (code)
8897 : {
8898 : case PLUS_EXPR:
8899 : case MINUS_EXPR:
8900 : case NEGATE_EXPR:
8901 : case BIT_AND_EXPR:
8902 : case BIT_IOR_EXPR:
8903 : case BIT_XOR_EXPR:
8904 : case BIT_NOT_EXPR:
8905 : return true;
8906 :
8907 11198 : default:
8908 11198 : return false;
8909 : }
8910 : }
8911 :
8912 : /* Likewise, but taking a code_helper. Return false when CODE is not a tree code; otherwise defer to the tree_code overload. */
8913 :
8914 : bool
8915 992 : vect_can_vectorize_without_simd_p (code_helper code)
8916 : {
8917 992 : return (code.is_tree_code ()
8918 992 : && vect_can_vectorize_without_simd_p (tree_code (code)));
8919 : }
8920 :
8921 : /* Create vector init for vectorized iv.
   Statements computing the vector are appended to STMTS and the
   resulting value of type VECTYPE is returned.  With X = INIT_EXPR
   (converted to the element type of VECTYPE) and S = STEP_EXPR the
   vector built for each INDUCTION_TYPE is:

     vect_step_op_shr/shl: a splat of X shifted by VEC_SERIES_EXPR <0, S>,
       i.e. [X, X >> S, X >> 2*S, ...] (<< for shl).
     vect_step_op_neg: [X, -X, X, -X, ...], an interleave of a splat of X
       and its negation.
     vect_step_op_mul: [X, X*S, X*S*S, ...], computed in the unsigned
       variant of the element type; NUNITS must be constant (asserted).

   Any other INDUCTION_TYPE is unreachable. */
8922 : static tree
8923 916 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8924 : tree step_expr, poly_uint64 nunits,
8925 : tree vectype,
8926 : enum vect_induction_op_type induction_type)
8927 : {
8928 916 : unsigned HOST_WIDE_INT const_nunits;
8929 916 : tree vec_shift, vec_init, new_name;
8930 916 : unsigned i;
8931 916 : tree itype = TREE_TYPE (vectype);
8932 :
8933 : /* Convert the initial value X = init_expr to the vector element type;
8934 : the cases below build vec_init from it and from S = step_expr. */
8935 916 : new_name = gimple_convert (stmts, itype, init_expr);
8936 916 : switch (induction_type)
8937 : {
8938 18 : case vect_step_op_shr:
8939 18 : case vect_step_op_shl:
8940 : /* Build the Initial value from shift_expr: splat X, then shift lane i by i * S using the series [0, S, 2*S, ...]. */
8941 18 : vec_init = gimple_build_vector_from_val (stmts,
8942 : vectype,
8943 : new_name);
8944 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8945 : build_zero_cst (itype), step_expr);
8946 18 : vec_init = gimple_build (stmts,
8947 : (induction_type == vect_step_op_shr
8948 : ? RSHIFT_EXPR : LSHIFT_EXPR),
8949 : vectype, vec_init, vec_shift);
8950 18 : break;
8951 :
8952 822 : case vect_step_op_neg:
8953 822 : {
8954 822 : vec_init = gimple_build_vector_from_val (stmts,
8955 : vectype,
8956 : new_name);
8957 822 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8958 : vectype, vec_init);
8959 : /* The encoding has 2 interleaved stepped patterns: { 0, nunits, 1, nunits + 1, 2, nunits + 2 }, selecting alternately from vec_init and vec_neg. */
8960 822 : vec_perm_builder sel (nunits, 2, 3);
8961 822 : sel.quick_grow (6);
8962 4110 : for (i = 0; i < 3; i++)
8963 : {
8964 2466 : sel[2 * i] = i;
8965 2466 : sel[2 * i + 1] = i + nunits;
8966 : }
8967 822 : vec_perm_indices indices (sel, 2, nunits);
8968 : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8969 : fail when vec_init is const vector. In that situation vec_perm is not
8970 : really needed. */
8971 822 : tree perm_mask_even
8972 822 : = vect_gen_perm_mask_any (vectype, indices);
8973 822 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8974 : vectype,
8975 : vec_init, vec_neg,
8976 : perm_mask_even);
8977 822 : }
8978 822 : break;
8979 :
8980 76 : case vect_step_op_mul:
8981 76 : {
8982 : /* Use unsigned multiplication to avoid undefined signed integer overflow. */
8983 76 : gcc_assert (nunits.is_constant (&const_nunits));
8984 76 : tree utype = unsigned_type_for (itype);
8985 76 : tree uvectype = build_vector_type (utype,
8986 76 : TYPE_VECTOR_SUBPARTS (vectype));
8987 76 : new_name = gimple_convert (stmts, utype, new_name);
8988 76 : vec_init = gimple_build_vector_from_val (stmts,
8989 : uvectype,
8990 : new_name);
8991 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
8992 76 : tree elt_step = build_one_cst (utype);
8993 :
8994 76 : elts.quick_push (elt_step);
8995 660 : for (i = 1; i < const_nunits; i++)
8996 : {
8997 : /* Create: elt_step_i = elt_step_(i-1) * step_expr. */
8998 508 : elt_step = gimple_build (stmts, MULT_EXPR,
8999 : utype, elt_step, step_expr);
9000 508 : elts.quick_push (elt_step);
9001 : }
9002 : /* Create a vector from [1, S, S*S, ..., elt_step_(nunits-1)] and
9003 : multiply the splat of X by it. */
9004 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
9005 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9006 : vec_init, vec_mul);
9007 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
9008 76 : }
9009 76 : break;
9010 :
9011 0 : default:
9012 0 : gcc_unreachable ();
9013 : }
9014 :
9015 916 : return vec_init;
9016 : }
9017 :
9018 : /* Peel init_expr by skip_niter for induction_type.
   Return the value the induction has after SKIP_NITERS iterations when
   it starts at INIT_EXPR and evolves by STEP_EXPR according to
   INDUCTION_TYPE.  Any statements needed are appended to STMTS.
   SKIP_NITERS must be an INTEGER_CST unless EARLY_EXIT_P is true, and
   vect_step_op_mul always requires an INTEGER_CST (both asserted). */
9019 : tree
9020 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9021 : tree skip_niters, tree step_expr,
9022 : enum vect_induction_op_type induction_type,
9023 : bool early_exit_p)
9024 : {
9025 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST || early_exit_p);
9026 84 : tree type = TREE_TYPE (init_expr);
9027 84 : unsigned prec = TYPE_PRECISION (type);
9028 84 : switch (induction_type)
9029 : {
9030 : /* neg inductions are typically not used for loop termination conditions but
9031 : are typically implemented as b = -b. That is every scalar iteration b is
9032 : negated. That means that for the initial value of b we will have to
9033 : determine whether the number of skipped iteration is a multiple of 2
9034 : because every 2 scalar iterations we are back at "b". */
9035 0 : case vect_step_op_neg:
9036 : /* For early exits the neg induction will always be the same value at the
9037 : start of the iteration. */
9038 0 : if (early_exit_p)
9039 : break;
9040 :
9041 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
9042 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9043 : /* else no change. */
9044 : break;
9045 :
9046 12 : case vect_step_op_shr:
9047 12 : case vect_step_op_shl:
/* The total shift amount is step_expr * skip_niters, computed in TYPE.  */
9048 12 : skip_niters = fold_build1 (NOP_EXPR, type, skip_niters);
9049 12 : step_expr = fold_build1 (NOP_EXPR, type, step_expr);
9050 12 : step_expr = fold_build2 (MULT_EXPR, type, step_expr, skip_niters);
9051 : /* When shift amount >= precision, need to avoid undefined behavior.
9052 : In the original loop, there's no undefined behavior, and according to semantic,
9053 : init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
9054 12 : if ((!tree_fits_uhwi_p (step_expr)
9055 12 : || tree_to_uhwi (step_expr) >= prec)
9056 6 : && !early_exit_p)
9057 : {
9058 6 : if (induction_type == vect_step_op_shl
9059 6 : || TYPE_UNSIGNED (type))
9060 4 : init_expr = build_zero_cst (type);
9061 : else
9062 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9063 : init_expr,
9064 4 : wide_int_to_tree (type, prec - 1));
9065 : }
9066 : else
9067 : {
9068 8 : init_expr = fold_build2 ((induction_type == vect_step_op_shr
9069 : ? RSHIFT_EXPR : LSHIFT_EXPR),
9070 : type, init_expr, step_expr);
9071 6 : init_expr = force_gimple_operand (init_expr, stmts, false, NULL);
9072 : }
9073 : break;
9074 :
9075 72 : case vect_step_op_mul:
9076 72 : {
9077 : /* Due to UB we can't support vect_step_op_mul with early break for now.
9078 : so assert and block. The peeled value is
   init_expr * (step_expr ** skip_niters mod 2 ** precision), with the
   multiplication done in the unsigned variant of TYPE. */
9079 72 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9080 72 : tree utype = unsigned_type_for (type);
9081 72 : init_expr = gimple_convert (stmts, utype, init_expr);
9082 72 : wide_int skipn = wi::to_wide (skip_niters);
9083 72 : wide_int begin = wi::to_wide (step_expr);
9084 72 : auto_mpz base, exp, mod, res;
9085 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
9086 72 : wi::to_mpz (skipn, exp, UNSIGNED);
9087 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9088 72 : mpz_powm (res, base, exp, mod);
9089 72 : begin = wi::from_mpz (utype, res, true);
9090 72 : tree mult_expr = wide_int_to_tree (utype, begin);
9091 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
9092 : init_expr, mult_expr);
9093 72 : init_expr = gimple_convert (stmts, type, init_expr);
9094 72 : }
9095 72 : break;
9096 :
9097 0 : default:
9098 0 : gcc_unreachable ();
9099 : }
9100 :
9101 84 : return init_expr;
9102 : }
9103 :
9104 : /* Create the scalar step for a vectorized nonlinear iv that advances by VF
   scalar iterations at a time.  The result has the type of STEP_EXPR:
   for vect_step_op_mul it is the constant STEP_EXPR multiplied by itself
   VF times in total (VF must be constant, asserted); for vect_step_op_neg
   it is NULL since no step is needed; otherwise it is VF * STEP_EXPR,
   built by appending to STMTS. */
9105 : static tree
9106 1202 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9107 : poly_uint64 vf,
9108 : enum vect_induction_op_type induction_type)
9109 : {
9110 1202 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9111 1202 : tree new_name = NULL;
9112 : /* Step should be pow (step, vf) for mult induction. */
9113 1202 : if (induction_type == vect_step_op_mul)
9114 : {
9115 76 : gcc_assert (vf.is_constant ());
9116 76 : wide_int begin = wi::to_wide (step_expr);
9117 :
9118 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9119 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9120 :
9121 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9122 76 : }
9123 1126 : else if (induction_type == vect_step_op_neg)
9124 : /* Do nothing: NEW_NAME stays NULL. */
9125 : ;
9126 : else
9127 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9128 : expr, step_expr);
9129 1202 : return new_name;
9130 : }
9131 :
/* Create the vector step for a vectorized nonlinear iv: broadcast the
   scalar step NEW_NAME into a vector of type VECTYPE and hand it to
   vect_init_vector on behalf of STMT_INFO.  Return the resulting vector
   value, or NULL for vect_step_op_neg, which needs no step.  For all
   other induction types NEW_NAME must be a constant or an SSA name
   (asserted).  */
9132 : static tree
9133 1202 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9134 : stmt_vec_info stmt_info,
9135 : tree new_name, tree vectype,
9136 : enum vect_induction_op_type induction_type)
9137 : {
9138 : /* No step is needed for neg induction. */
9139 1202 : if (induction_type == vect_step_op_neg)
9140 : return NULL;
9141 :
9142 94 : tree t = unshare_expr (new_name);
9143 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9144 : || TREE_CODE (new_name) == SSA_NAME);
9145 94 : tree new_vec = build_vector_from_val (vectype, t);
9146 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9147 : new_vec, vectype, NULL);
9148 94 : return vec_step;
9149 : }
9150 :
9151 : /* Update vectorized iv with vect_step, induc_def is init.
   Return INDUC_DEF advanced by VEC_STEP according to INDUCTION_TYPE,
   appending the needed statements to STMTS: a multiplication done in the
   unsigned variant of VECTYPE for vect_step_op_mul, a right or left
   shift for vect_step_op_shr / vect_step_op_shl, and INDUC_DEF itself,
   unchanged, for vect_step_op_neg. */
9152 : static tree
9153 1390 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9154 : tree induc_def, tree vec_step,
9155 : enum vect_induction_op_type induction_type)
9156 : {
9157 1390 : tree vec_def = induc_def;
9158 1390 : switch (induction_type)
9159 : {
9160 76 : case vect_step_op_mul:
9161 76 : {
9162 : /* Use unsigned multiplication to avoid undefined signed integer overflow. */
9163 76 : tree uvectype = unsigned_type_for (vectype);
9164 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9165 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9166 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9167 : vec_def, vec_step);
9168 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9169 : }
9170 76 : break;
9171 :
9172 12 : case vect_step_op_shr:
9173 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9174 : vec_def, vec_step);
9175 12 : break;
9176 :
9177 6 : case vect_step_op_shl:
9178 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9179 : vec_def, vec_step);
9180 6 : break;
9181 : case vect_step_op_neg:
9182 : vec_def = induc_def;
9183 : /* Do nothing: VEC_DEF was already initialized to INDUC_DEF, so the assignment above does not change it. */
9184 : break;
9185 0 : default:
9186 0 : gcc_unreachable ();
9187 : }
9188 :
9189 1390 : return vec_def;
9190 :
9191 : }
9192 :
9193 : /* Function vectorizable_nonlinear_induction
9194 :
9195 : Check if STMT_INFO performs a nonlinear induction computation that can be
9196 : vectorized. If COST_VEC is non-null only record the costs and mark SLP_NODE
9197 : as induc_vec_info_type. Otherwise vectorize the induction PHI: create a
9198 : vectorized phi in the loop header and record it as a vector def of SLP_NODE.
9199 : Return true if STMT_INFO is vectorizable in this way. */
9200 :
9201 : static bool
9202 9198 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9203 : stmt_vec_info stmt_info,
9204 : slp_tree slp_node,
9205 : stmt_vector_for_cost *cost_vec)
9206 : {
9207 9198 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9208 9198 : unsigned ncopies;
9209 9198 : bool nested_in_vect_loop = false;
9210 9198 : class loop *iv_loop;
9211 9198 : tree vec_def;
9212 9198 : edge pe = loop_preheader_edge (loop);
9213 9198 : basic_block new_bb;
9214 9198 : tree vec_init, vec_step;
9215 9198 : tree new_name;
9216 9198 : gimple *new_stmt;
9217 9198 : gphi *induction_phi;
9218 9198 : tree induc_def, vec_dest;
9219 9198 : tree init_expr, step_expr;
9220 9198 : tree niters_skip;
9221 9198 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9222 9198 : unsigned i;
9223 9198 : gimple_stmt_iterator si;
9224 :
9225 9198 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9226 :
9227 9198 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9228 9198 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9229 9198 : enum vect_induction_op_type induction_type
9230 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9231 :
9232 9198 : gcc_assert (induction_type > vect_step_op_add);
9233 :
9234 9198 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
9235 9198 : gcc_assert (ncopies >= 1);
9236 :
9237 : /* FORNOW. Only handle nonlinear induction in the same loop. */
9238 9198 : if (nested_in_vect_loop_p (loop, stmt_info))
9239 : {
9240 0 : if (dump_enabled_p ())
9241 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9242 : "nonlinear induction in nested loop.\n");
9243 0 : return false;
9244 : }
9245 :
9246 9198 : iv_loop = loop;
9247 9198 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9248 :
9249 : /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
9250 : vector iv update for each iv and a permutation to generate wanted
9251 : vector iv. */
9252 9198 : if (SLP_TREE_LANES (slp_node) > 1)
9253 : {
9254 0 : if (dump_enabled_p ())
9255 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9256 : "SLP induction not supported for nonlinear"
9257 : " induction.\n");
9258 0 : return false;
9259 : }
9260 :
9261 9198 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9262 : {
9263 0 : if (dump_enabled_p ())
9264 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9265 : "floating point nonlinear induction vectorization"
9266 : " not supported.\n");
9267 0 : return false;
9268 : }
9269 :
9270 9198 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9271 9198 : init_expr = vect_phi_initial_value (phi);
9272 9198 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9273 : && TREE_CODE (step_expr) == INTEGER_CST);
9274 : /* step_expr should be aligned with init_expr,
9275 : i.e. for uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9276 9198 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9277 :
9278 9198 : if (TREE_CODE (init_expr) == INTEGER_CST)
9279 4085 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9280 5113 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9281 : {
9282 : /* INIT_EXPR could be a bit_field, bail out for such case. */
9283 4 : if (dump_enabled_p ())
9284 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9285 : "nonlinear induction vectorization failed:"
9286 : " component type of vectype is not a nop conversion"
9287 : " from type of init_expr.\n");
9288 4 : return false;
9289 : }
9290 :
/* Check target support for the operations each induction type needs.  */
9291 9194 : switch (induction_type)
9292 : {
9293 3714 : case vect_step_op_neg:
9294 3714 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9295 : return false;
9296 3552 : if (TREE_CODE (init_expr) != INTEGER_CST
9297 282 : && TREE_CODE (init_expr) != REAL_CST)
9298 : {
9299 : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9300 282 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9301 0 : return false;
9302 :
9303 : /* The encoding has 2 interleaved stepped patterns: { 0, nunits, 1, nunits + 1, 2, nunits + 2 }. */
9304 282 : vec_perm_builder sel (nunits, 2, 3);
9305 282 : machine_mode mode = TYPE_MODE (vectype);
9306 282 : sel.quick_grow (6);
9307 1410 : for (i = 0; i < 3; i++)
9308 : {
9309 846 : sel[i * 2] = i;
9310 846 : sel[i * 2 + 1] = i + nunits;
9311 : }
9312 282 : vec_perm_indices indices (sel, 2, nunits);
9313 282 : if (!can_vec_perm_const_p (mode, mode, indices))
9314 0 : return false;
9315 282 : }
9316 : break;
9317 :
9318 1058 : case vect_step_op_mul:
9319 1058 : {
9320 : /* Check for backend support of MULT_EXPR. */
9321 1058 : if (!directly_supported_p (MULT_EXPR, vectype))
9322 : return false;
9323 :
9324 : /* ?? How to construct vector step for variable number vector.
9325 : [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9326 : if (!vf.is_constant ())
9327 : return false;
9328 : }
9329 : break;
9330 :
9331 4104 : case vect_step_op_shr:
9332 : /* Check for backend support of RSHIFT_EXPR. */
9333 4104 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9334 : return false;
9335 :
9336 : /* Don't shift more than type precision to avoid undefined behavior. */
9337 26 : if (!tree_fits_uhwi_p (step_expr)
9338 26 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9339 : TYPE_PRECISION (TREE_TYPE (init_expr))))
9340 : return false;
9341 : break;
9342 :
9343 318 : case vect_step_op_shl:
9344 : /* Check for backend support of LSHIFT_EXPR. */
9345 318 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9346 : return false;
9347 :
9348 : /* Don't shift more than type precision to avoid undefined behavior. */
9349 12 : if (!tree_fits_uhwi_p (step_expr)
9350 12 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9351 : TYPE_PRECISION (TREE_TYPE (init_expr))))
9352 : return false;
9353 :
9354 : break;
9355 :
9356 0 : default:
9357 0 : gcc_unreachable ();
9358 : }
9359 :
9360 4412 : if (cost_vec) /* transformation not required. */
9361 : {
9362 3496 : unsigned inside_cost = 0, prologue_cost = 0;
9363 : /* loop cost for vec_loop: NCOPIES vector statements in the loop
9364 : body. */
9365 3496 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9366 : slp_node, 0, vect_body);
9367 :
9368 : /* Neg induction doesn't have any inside_cost. Note that this only
9369 : resets the value printed in the dump below; the call above was still made with COST_VEC. */
9370 3496 : if (induction_type == vect_step_op_neg)
9371 2730 : inside_cost = 0;
9372 :
9373 : /* prologue cost for vec_init and vec_step. */
9374 3496 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9375 : slp_node, 0, vect_prologue);
9376 :
9377 3496 : if (dump_enabled_p ())
9378 68 : dump_printf_loc (MSG_NOTE, vect_location,
9379 : "vect_model_induction_cost: inside_cost = %d, "
9380 : "prologue_cost = %d. \n", inside_cost,
9381 : prologue_cost);
9382 :
9383 3496 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9384 3496 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9385 3496 : return true;
9386 : }
9387 :
9388 : /* Transform. */
9389 :
9390 : /* Compute a vector variable, initialized with the first nunits values of
9391 : the induction variable. E.g., for a multiplicative iv with IV_PHI='X' and
9392 : step S, for a vector of 4 units, we want to compute:
9393 : [X, X * S, X * S*S, X * S*S*S]. */
9394 :
9395 916 : if (dump_enabled_p ())
9396 32 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9397 :
9398 916 : pe = loop_preheader_edge (iv_loop);
9399 : /* Find the first insertion point in the BB. */
9400 916 : basic_block bb = gimple_bb (phi);
9401 916 : si = gsi_after_labels (bb);
9402 :
9403 916 : gimple_seq stmts = NULL;
9404 :
9405 916 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9406 : /* If we are using the loop mask to "peel" for alignment then we need
9407 : to adjust the start value here. */
9408 916 : if (niters_skip != NULL_TREE)
9409 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9410 : step_expr, induction_type, false);
9411 :
9412 916 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9413 : step_expr, nunits, vectype,
9414 : induction_type);
9415 916 : if (stmts)
9416 : {
9417 162 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9418 162 : gcc_assert (!new_bb);
9419 : }
9420 :
/* The step by which the vector iv advances per vector iteration, i.e.
   the scalar step applied VF times.  */
9421 916 : stmts = NULL;
9422 916 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9423 : vf, induction_type);
9424 916 : if (stmts)
9425 : {
9426 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9427 0 : gcc_assert (!new_bb);
9428 : }
9429 :
9430 916 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9431 : new_name, vectype,
9432 : induction_type);
9433 : /* Create the following def-use cycle:
9434 : loop prolog:
9435 : vec_init = ...
9436 : vec_step = ...
9437 : loop:
9438 : vec_iv = PHI <vec_init, vec_loop>
9439 : ...
9440 : STMT
9441 : ...
9442 : vec_loop = vec_iv OP vec_step; where OP is *, >> or << depending on
   the induction type (for neg, vec_loop is vec_iv itself). */
9443 :
9444 : /* Create the induction-phi that defines the induction-operand. */
9445 916 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9446 916 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9447 916 : induc_def = PHI_RESULT (induction_phi);
9448 :
9449 : /* Create the iv update inside the loop. */
9450 916 : stmts = NULL;
9451 916 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9452 : induc_def, vec_step,
9453 : induction_type);
9454 :
9455 916 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9456 916 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9457 :
9458 : /* Set the arguments of the phi node: */
9459 916 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9460 916 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9461 : UNKNOWN_LOCATION);
9462 :
9463 916 : slp_node->push_vec_def (induction_phi);
9464 :
9465 : /* In case that vectorization factor (VF) is bigger than the number
9466 : of elements that we can fit in a vectype (nunits), we have to generate
9467 : more than one vector stmt - i.e - we need to "unroll" the
9468 : vector stmt by a factor VF/nunits. For more details see documentation
9469 : in vectorizable_operation. */
9470 :
9471 916 : if (ncopies > 1)
9472 : {
9473 286 : stmts = NULL;
9474 : /* FORNOW. This restriction should be relaxed. */
9475 286 : gcc_assert (!nested_in_vect_loop);
9476 :
/* Successive copies are NUNITS scalar iterations apart, so the step
   between them is built from NUNITS rather than VF.  */
9477 286 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9478 : nunits, induction_type);
9479 :
9480 286 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9481 : new_name, vectype,
9482 : induction_type);
9483 286 : vec_def = induc_def;
9484 1046 : for (i = 1; i < ncopies; i++)
9485 : {
9486 : /* vec_i = vec_prev OP vec_step. */
9487 474 : stmts = NULL;
9488 474 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9489 : vec_def, vec_step,
9490 : induction_type);
9491 474 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9492 474 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9493 474 : slp_node->push_vec_def (new_stmt);
9494 : }
9495 : }
9496 :
9497 916 : if (dump_enabled_p ())
9498 64 : dump_printf_loc (MSG_NOTE, vect_location,
9499 : "transform induction: created def-use cycle: %G%G",
9500 32 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9501 :
9502 : return true;
9503 : }
9504 :
9505 : /* Function vectorizable_induction
9506 :
9507 : Check if STMT_INFO performs an induction computation that can be vectorized.
9508 : If COST_VEC is non-null only analyze and record the costs in it. Otherwise
9509 : transform: create the vector induction PHI(s) and push the vector defs to SLP_NODE.
9510 : Return true if STMT_INFO is vectorizable in this way. */
9511 :
9512 : bool
9513 316179 : vectorizable_induction (loop_vec_info loop_vinfo,
9514 : stmt_vec_info stmt_info,
9515 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9516 : {
9517 316179 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9518 316179 : bool nested_in_vect_loop = false;
9519 316179 : class loop *iv_loop;
9520 316179 : tree vec_def;
9521 316179 : edge pe = loop_preheader_edge (loop);
9522 316179 : basic_block new_bb;
9523 316179 : tree vec_init = NULL_TREE, vec_step, t;
9524 316179 : tree new_name;
9525 316179 : gphi *induction_phi;
9526 316179 : tree induc_def, vec_dest;
9527 316179 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9528 316179 : unsigned i;
9529 316179 : tree expr;
9530 316179 : tree index_vectype = NULL_TREE;
9531 316179 : gimple_stmt_iterator si;
9532 316179 : enum vect_induction_op_type induction_type
9533 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9534 :
9535 347234 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9536 170187 : if (!phi)
9537 : return false;
9538 :
9539 170187 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9540 : return false;
9541 :
9542 : /* Make sure it was recognized as induction computation. */
9543 170187 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9544 : return false;
9545 :
9546 : /* Handle nonlinear induction in a separate place. */
9547 166130 : if (induction_type != vect_step_op_add)
9548 9198 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9549 9198 : slp_node, cost_vec);
9550 :
9551 156932 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9552 156932 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9553 :
9554 : /* FORNOW. These restrictions should be relaxed. */
9555 156932 : if (nested_in_vect_loop_p (loop, stmt_info))
9556 : {
9557 813 : imm_use_iterator imm_iter;
9558 813 : use_operand_p use_p;
9559 813 : gimple *exit_phi;
9560 813 : edge latch_e;
9561 813 : tree loop_arg;
9562 :
9563 813 : exit_phi = NULL;
9564 813 : latch_e = loop_latch_edge (loop->inner);
9565 813 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9566 2475 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9567 : {
9568 873 : gimple *use_stmt = USE_STMT (use_p);
9569 873 : if (is_gimple_debug (use_stmt))
9570 36 : continue;
9571 :
9572 837 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9573 : {
9574 : exit_phi = use_stmt;
9575 : break;
9576 : }
9577 813 : }
9578 813 : if (exit_phi)
9579 : {
9580 24 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9581 24 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9582 8 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9583 : {
9584 16 : if (dump_enabled_p ())
9585 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9586 : "inner-loop induction only used outside "
9587 : "of the outer vectorized loop.\n");
9588 16 : return false;
9589 : }
9590 : }
9591 :
9592 797 : nested_in_vect_loop = true;
9593 797 : iv_loop = loop->inner;
9594 : }
9595 : else
9596 : iv_loop = loop;
9597 156916 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9598 :
9599 156916 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
9600 : {
9601 : /* The current SLP code creates the step value element-by-element. */
9602 : if (dump_enabled_p ())
9603 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9604 : "SLP induction not supported for variable-length"
9605 : " vectors.\n");
9606 : return false;
9607 : }
9608 :
9609 156916 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9610 : {
9611 12 : if (dump_enabled_p ())
9612 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9613 : "floating point induction vectorization disabled\n");
9614 12 : return false;
9615 : }
9616 :
9617 156904 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9618 156904 : gcc_assert (step_expr != NULL_TREE);
9619 313784 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
9620 313685 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
9621 : {
9622 12 : if (dump_enabled_p ())
9623 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9624 : "bit-precision induction vectorization not "
9625 : "supported.\n");
9626 12 : return false;
9627 : }
9628 156892 : tree stept = TREE_TYPE (step_expr);
9629 156892 : tree step_vectype = get_same_sized_vectype (stept, vectype);
9630 156892 : stept = TREE_TYPE (step_vectype);
9631 :
9632 : /* Check for target support of the vectorized arithmetic used here. */
9633 156892 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
9634 156892 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
9635 26958 : return false;
9636 129934 : if (!nunits.is_constant ())
9637 : {
9638 : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
9639 : return false;
9640 : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
9641 : if (SCALAR_FLOAT_TYPE_P (stept))
9642 : {
9643 : tree index_type = build_nonstandard_integer_type
9644 : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
9645 :
9646 : index_vectype = build_vector_type (index_type, nunits);
9647 : if (!can_float_p (TYPE_MODE (step_vectype),
9648 : TYPE_MODE (index_vectype), 1))
9649 : return false;
9650 : }
9651 : }
9652 :
9653 129934 : unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
9654 129934 : if (cost_vec) /* Analysis only: transformation not required. */
9655 : {
9656 343617 : unsigned inside_cost = 0, prologue_cost = 0;
9657 : /* We eventually need to set a vector type on invariant
9658 : arguments. */
9659 : unsigned j;
9660 : slp_tree child;
9661 343617 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9662 229078 : if (!vect_maybe_update_slp_op_vectype
9663 229078 : (child, SLP_TREE_VECTYPE (slp_node)))
9664 : {
9665 0 : if (dump_enabled_p ())
9666 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9667 : "incompatible vector types for "
9668 : "invariants\n");
9669 0 : return false;
9670 : }
9671 : /* loop cost for vec_loop. */
9672 114539 : inside_cost = record_stmt_cost (cost_vec, nvects,
9673 : vector_stmt, slp_node, 0, vect_body);
9674 : /* prologue cost for vec_init (if not nested) and step. */
9675 114539 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9676 : scalar_to_vec,
9677 : slp_node, 0, vect_prologue);
9678 114539 : if (dump_enabled_p ())
9679 4088 : dump_printf_loc (MSG_NOTE, vect_location,
9680 : "vect_model_induction_cost: inside_cost = %d, "
9681 : "prologue_cost = %d .\n", inside_cost,
9682 : prologue_cost);
9683 :
9684 114539 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9685 114539 : DUMP_VECT_SCOPE ("vectorizable_induction");
9686 114539 : return true;
9687 : }
9688 :
9689 : /* Transform. */
9690 :
9691 : /* Compute a vector variable, initialized with the first VF values of
9692 : the induction variable. E.g., for an iv with IV_PHI='X' and
9693 : evolution S, for a vector of 4 units, we want to compute:
9694 : [X, X + S, X + 2*S, X + 3*S]. */
9695 :
9696 15395 : if (dump_enabled_p ())
9697 2791 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9698 :
9699 15395 : pe = loop_preheader_edge (iv_loop);
9700 : /* Find the first insertion point in the BB. */
9701 15395 : basic_block bb = gimple_bb (phi);
9702 15395 : si = gsi_after_labels (bb);
9703 :
9704 : /* For SLP induction we have to generate several IVs as for example
9705 : with group size 3 we need
9706 : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9707 : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9708 15395 : gimple_stmt_iterator incr_si;
9709 15395 : bool insert_after;
9710 15395 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
9711 :
9712 : /* The initial values are vectorized, but any lanes > group_size
9713 : need adjustment. */
9714 15395 : slp_tree init_node
9715 15395 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9716 :
9717 : /* Gather steps. Since we do not vectorize inductions as
9718 : cycles we have to reconstruct the step from SCEV data. */
9719 15395 : unsigned group_size = SLP_TREE_LANES (slp_node);
9720 15395 : tree *steps = XALLOCAVEC (tree, group_size);
9721 15395 : tree *inits = XALLOCAVEC (tree, group_size);
9722 15395 : stmt_vec_info phi_info;
9723 47459 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9724 : {
9725 16669 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9726 16669 : if (!init_node)
9727 16424 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9728 : pe->dest_idx);
9729 : }
9730 :
9731 : /* Now generate the IVs. */
9732 30790 : gcc_assert (multiple_p (nunits * nvects, group_size));
9733 15395 : unsigned nivs;
9734 15395 : unsigned HOST_WIDE_INT const_nunits;
9735 15395 : if (nested_in_vect_loop)
9736 : nivs = nvects;
9737 15171 : else if (nunits.is_constant (&const_nunits))
9738 : {
9739 : /* Compute the number of distinct IVs we need. First reduce
9740 : group_size if it is a multiple of const_nunits so we get
9741 : one IV for a group_size of 4 but const_nunits 2. */
9742 15171 : unsigned group_sizep = group_size;
9743 15171 : if (group_sizep % const_nunits == 0)
9744 113 : group_sizep = group_sizep / const_nunits;
9745 15171 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
9746 : }
9747 : else
9748 : {
9749 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9750 : nivs = 1;
9751 : }
9752 15395 : gimple_seq init_stmts = NULL;
9753 15395 : tree lupdate_mul = NULL_TREE;
9754 224 : if (!nested_in_vect_loop)
9755 : {
9756 15171 : if (nunits.is_constant (&const_nunits))
9757 : {
9758 : /* The number of iterations covered in one vector iteration. */
9759 15171 : unsigned lup_mul = (nvects * const_nunits) / group_size;
9760 15171 : lupdate_mul
9761 15171 : = build_vector_from_val (step_vectype,
9762 15171 : SCALAR_FLOAT_TYPE_P (stept)
9763 28 : ? build_real_from_wide (stept, lup_mul,
9764 : UNSIGNED)
9765 30314 : : build_int_cstu (stept, lup_mul));
9766 : }
9767 : else
9768 : {
9769 : if (SCALAR_FLOAT_TYPE_P (stept))
9770 : {
9771 : tree tem = build_int_cst (integer_type_node, vf);
9772 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9773 : }
9774 : else
9775 : lupdate_mul = build_int_cst (stept, vf);
9776 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9777 : lupdate_mul);
9778 : }
9779 : }
9780 15395 : tree peel_mul = NULL_TREE;
9781 15395 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9782 : {
9783 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9784 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9785 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9786 : else
9787 0 : peel_mul = gimple_convert (&init_stmts, stept,
9788 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9789 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
9790 : step_vectype, peel_mul);
9791 : }
9792 15395 : tree step_mul = NULL_TREE;
9793 15395 : unsigned ivn;
9794 15395 : auto_vec<tree> vec_steps;
9795 31366 : for (ivn = 0; ivn < nivs; ++ivn)
9796 : {
9797 15971 : gimple_seq stmts = NULL;
9798 15971 : bool invariant = true;
9799 15971 : if (nunits.is_constant (&const_nunits))
9800 : {
9801 15971 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9802 15971 : tree_vector_builder init_elts (vectype, const_nunits, 1);
9803 15971 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9804 102905 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9805 : {
9806 : /* The scalar steps of the IVs. */
9807 86934 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9808 86934 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9809 86934 : step_elts.quick_push (elt);
9810 86934 : if (!init_node)
9811 : {
9812 : /* The scalar inits of the IVs if not vectorized. */
9813 85672 : elt = inits[(ivn*const_nunits + eltn) % group_size];
9814 85672 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
9815 85672 : TREE_TYPE (elt)))
9816 260 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9817 260 : TREE_TYPE (vectype), elt);
9818 85672 : init_elts.quick_push (elt);
9819 : }
9820 : /* The number of steps to add to the initial values. */
9821 86934 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9822 173868 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9823 173766 : ? build_real_from_wide (stept, mul_elt,
9824 : UNSIGNED)
9825 173766 : : build_int_cstu (stept, mul_elt));
9826 : }
9827 15971 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
9828 15971 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9829 15971 : if (!init_node)
9830 15713 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
9831 15971 : }
9832 : else
9833 : {
9834 : tree step = gimple_convert (&init_stmts, stept, steps[0]);
9835 : if (init_node)
9836 : ;
9837 : else if (INTEGRAL_TYPE_P (stept))
9838 : {
9839 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9840 : /* Build the initial value directly as a VEC_SERIES_EXPR. */
9841 : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
9842 : step_vectype, new_name, step);
9843 : if (!useless_type_conversion_p (vectype, step_vectype))
9844 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9845 : vectype, vec_init);
9846 : }
9847 : else
9848 : {
9849 : /* Build:
9850 : [base, base, base, ...]
9851 : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9852 : gcc_assert (SCALAR_FLOAT_TYPE_P (stept));
9853 : gcc_assert (flag_associative_math);
9854 : gcc_assert (index_vectype != NULL_TREE);
9855 :
9856 : tree index = build_index_vector (index_vectype, 0, 1);
9857 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9858 : tree base_vec = gimple_build_vector_from_val (&init_stmts,
9859 : step_vectype,
9860 : new_name);
9861 : tree step_vec = gimple_build_vector_from_val (&init_stmts,
9862 : step_vectype,
9863 : step);
9864 : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
9865 : step_vectype, index);
9866 : vec_init = gimple_build (&init_stmts, MULT_EXPR,
9867 : step_vectype, vec_init, step_vec);
9868 : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
9869 : step_vectype, vec_init, base_vec);
9870 : if (!useless_type_conversion_p (vectype, step_vectype))
9871 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9872 : vectype, vec_init);
9873 : }
9874 : /* NUNITS is not a compile-time constant here. Generate:
9875 : vec_step = [S, S, S, ...] */
9876 : t = unshare_expr (step);
9877 : gcc_assert (CONSTANT_CLASS_P (t)
9878 : || TREE_CODE (t) == SSA_NAME);
9879 : vec_step = gimple_build_vector_from_val (&init_stmts,
9880 : step_vectype, t);
9881 : }
9882 15971 : vec_steps.safe_push (vec_step);
9883 15971 : if (peel_mul)
9884 : {
9885 0 : if (!step_mul)
9886 : {
9887 0 : gcc_assert (!nunits.is_constant ());
9888 : step_mul = gimple_build (&init_stmts,
9889 : MINUS_EXPR, step_vectype,
9890 : build_zero_cst (step_vectype), peel_mul);
9891 : }
9892 : else
9893 0 : step_mul = gimple_build (&init_stmts,
9894 : MINUS_EXPR, step_vectype,
9895 : step_mul, peel_mul);
9896 : }
9897 :
9898 : /* Create the induction-phi that defines the induction-operand. */
9899 15971 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9900 : "vec_iv_");
9901 15971 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9902 15971 : induc_def = PHI_RESULT (induction_phi);
9903 :
9904 : /* Create the iv update inside the loop */
9905 15971 : tree up = vec_step;
9906 15971 : if (lupdate_mul)
9907 : {
9908 15713 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
9909 : {
9910 : /* When we're using loop_len produced by SELECT_VL, the
9911 : non-final iterations are not always processing VF
9912 : elements. So for the vectorized induction variable, instead of
9913 :
9914 : _21 = vect_vec_iv_.6_22 + { VF, ... };
9915 :
9916 : We should generate:
9917 :
9918 : _35 = .SELECT_VL (ivtmp_33, VF);
9919 : vect_cst__22 = [vec_duplicate_expr] _35;
9920 : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
9921 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
9922 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
9923 : vectype, 0, 0, false);
9924 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9925 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
9926 : else
9927 0 : expr = gimple_convert (&stmts, stept, len);
9928 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
9929 : expr);
9930 0 : up = gimple_build (&stmts, MULT_EXPR,
9931 : step_vectype, vec_step, lupdate_mul);
9932 : }
9933 : else
9934 15713 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9935 : vec_step, lupdate_mul);
9936 : }
9937 15971 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9938 15971 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
9939 15971 : vec_def = gimple_convert (&stmts, vectype, vec_def);
9940 15971 : insert_iv_increment (&incr_si, insert_after, stmts);
9941 15971 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9942 : UNKNOWN_LOCATION);
9943 :
9944 15971 : if (init_node)
9945 258 : vec_init = vect_get_slp_vect_def (init_node, ivn);
9946 15971 : if (!nested_in_vect_loop
9947 15971 : && step_mul
9948 15971 : && !integer_zerop (step_mul))
9949 : {
9950 15266 : gcc_assert (invariant);
9951 15266 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9952 15266 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9953 : vec_step, step_mul);
9954 15266 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9955 : vec_def, up);
9956 15266 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9957 : }
9958 :
9959 : /* Set the arguments of the phi node: */
9960 15971 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9961 :
9962 15971 : slp_node->push_vec_def (induction_phi);
9963 : }
9964 15395 : if (!nested_in_vect_loop)
9965 : {
9966 : /* Fill up to the number of vectors we need for the whole group. */
9967 15171 : if (nunits.is_constant (&const_nunits))
9968 15171 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
9969 : else
9970 : nivs = 1;
9971 15171 : vec_steps.reserve (nivs-ivn);
9972 30369 : for (; ivn < nivs; ++ivn)
9973 : {
9974 27 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9975 27 : vec_steps.quick_push (vec_steps[0]);
9976 : }
9977 : }
9978 :
9979 : /* Re-use IVs when we can. We are generating further vector
9980 : stmts by adding VF' * stride to the IVs generated above. */
9981 15395 : if (ivn < nvects)
9982 : {
9983 3390 : if (nunits.is_constant (&const_nunits))
9984 : {
9985 3390 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
9986 3390 : / group_size);
9987 3390 : lupdate_mul
9988 3390 : = build_vector_from_val (step_vectype,
9989 3390 : SCALAR_FLOAT_TYPE_P (stept)
9990 8 : ? build_real_from_wide (stept,
9991 8 : vfp, UNSIGNED)
9992 6772 : : build_int_cstu (stept, vfp));
9993 : }
9994 : else
9995 : {
9996 : if (SCALAR_FLOAT_TYPE_P (stept))
9997 : {
9998 : tree tem = build_int_cst (integer_type_node, nunits);
9999 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
10000 : }
10001 : else
10002 : lupdate_mul = build_int_cst (stept, nunits);
10003 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
10004 : lupdate_mul);
10005 : }
10006 10966 : for (; ivn < nvects; ++ivn)
10007 : {
10008 7576 : gimple *iv
10009 7576 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10010 7576 : tree def = gimple_get_lhs (iv);
10011 7576 : if (ivn < 2*nivs)
10012 3488 : vec_steps[ivn - nivs]
10013 3488 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10014 3488 : vec_steps[ivn - nivs], lupdate_mul);
10015 7576 : gimple_seq stmts = NULL;
10016 7576 : def = gimple_convert (&stmts, step_vectype, def);
10017 22728 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10018 7576 : def, vec_steps[ivn % nivs]);
10019 7576 : def = gimple_convert (&stmts, vectype, def);
10020 7576 : if (gimple_code (iv) == GIMPLE_PHI)
10021 3488 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10022 : else
10023 : {
10024 4088 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10025 4088 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10026 : }
10027 7576 : slp_node->push_vec_def (def);
10028 : }
10029 : }
10030 :
10031 15395 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10032 15395 : gcc_assert (!new_bb);
10033 :
10034 15395 : return true;
10035 15395 : }
10036 :
10037 : /* Function vectorizable_live_operation_1.
10038 : Helper function for vectorizable_live_operation. Create a PHI for VEC_LHS in EXIT_BB,
10039 : extract the live lane from it converted to LHS_TYPE, insert the new statements after the labels of EXIT_BB (that position is stored in *EXIT_GSI) and return the extracted value. */
10040 :
10041 : static tree
10042 2842 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
10043 : tree vectype, slp_tree slp_node,
10044 : tree bitsize, tree bitstart, tree vec_lhs,
10045 : tree lhs_type, gimple_stmt_iterator *exit_gsi)
10046 : {
10047 2842 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10048 :
10049 2842 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10050 2842 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10051 5686 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10052 2844 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
10053 :
10054 2842 : gimple_seq stmts = NULL;
10055 2842 : tree new_tree;
10056 :
10057 : /* If bitstart is 0 then we can use a BIT_FIELD_REF. */
10058 2842 : if (integer_zerop (bitstart))
10059 : {
10060 217 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10061 : vec_lhs_phi, bitsize, bitstart);
10062 :
10063 : /* Convert the extracted vector element to the scalar type. */
10064 217 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10065 : }
10066 2625 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10067 : {
10068 : /* Emit:
10069 :
10070 : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>
10071 :
10072 : where VEC_LHS is the vectorized live-out result, LEN is the length of
10073 : the vector, BIAS is the load-store bias. The bias should not be used
10074 : at all since we are not using load/store operations, but LEN will be
10075 : REALLEN + BIAS, so subtract it to get to the correct position. */
10076 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10077 0 : gimple_seq tem = NULL;
10078 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10079 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
10080 : &LOOP_VINFO_LENS (loop_vinfo),
10081 : 1, vectype, 0, 1, false);
10082 0 : gimple_seq_add_seq (&stmts, tem);
10083 :
10084 : /* LAST_INDEX = LEN - 1. */
10085 0 : tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
10086 0 : len, build_one_cst (TREE_TYPE (len)));
10087 :
10088 : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>. */
10089 0 : tree scalar_res
10090 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10091 : vec_lhs_phi, last_index);
10092 :
10093 : /* Convert the extracted vector element to the scalar type. */
10094 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10095 : }
10096 2625 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10097 : {
10098 : /* Emit:
10099 :
10100 : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10101 :
10102 : where VEC_LHS is the vectorized live-out result and MASK is
10103 : the loop mask for the final iteration. */
10104 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10105 0 : tree scalar_type = TREE_TYPE (vectype);
10106 0 : gimple_seq tem = NULL;
10107 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10108 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10109 : &LOOP_VINFO_MASKS (loop_vinfo),
10110 : 1, vectype, 0);
10111 0 : tree scalar_res;
10112 0 : gimple_seq_add_seq (&stmts, tem);
10113 :
10114 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10115 : mask, vec_lhs_phi);
10116 :
10117 : /* Convert the extracted vector element to the scalar type. */
10118 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10119 : }
10120 : else
10121 : {
10122 2625 : tree bftype = TREE_TYPE (vectype);
10123 2625 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10124 85 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10125 2625 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10126 2625 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10127 : &stmts, true, NULL_TREE);
10128 : }
10129 :
10130 2842 : *exit_gsi = gsi_after_labels (exit_bb);
10131 2842 : if (stmts)
10132 2842 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10133 :
10134 2842 : return new_tree;
10135 : }
10136 :
10137 : /* Function vectorizable_live_operation.
10138 :
10139 : STMT_INFO computes a value that is used outside the loop. Check if
10140 : it can be supported. */
10141 :
10142 : bool
10143 261768 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10144 : slp_tree slp_node, slp_instance slp_node_instance,
10145 : int slp_index, bool vec_stmt_p,
10146 : stmt_vector_for_cost *cost_vec)
10147 : {
10148 261768 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10149 261768 : imm_use_iterator imm_iter;
10150 261768 : tree lhs, lhs_type, bitsize;
10151 261768 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10152 261768 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10153 261768 : gimple *use_stmt;
10154 261768 : use_operand_p use_p;
10155 261768 : auto_vec<tree> vec_oprnds;
10156 261768 : int vec_entry = 0;
10157 261768 : poly_uint64 vec_index = 0;
10158 :
10159 261768 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10160 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10161 :
10162 : /* If a stmt of a reduction is live, vectorize it via
10163 : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10164 : validity so just trigger the transform here. */
10165 261768 : if (vect_is_reduction (slp_node))
10166 : {
10167 87066 : if (!vec_stmt_p)
10168 : {
10169 63600 : SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
10170 63600 : return true;
10171 : }
10172 : /* For SLP reductions we vectorize the epilogue for all involved stmts
10173 : together. For SLP reduction chains we only get here once. */
10174 23466 : if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
10175 23195 : && slp_index != 0)
10176 : return true;
10177 23018 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
10178 23018 : if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10179 23018 : || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10180 : return true;
10181 :
10182 22123 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10183 22123 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10184 22114 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10185 : slp_node_instance,
10186 : LOOP_VINFO_MAIN_EXIT (loop_vinfo));
10187 :
10188 : /* If early break we only have to materialize the reduction on the merge
10189 : block, but we have to find an alternate exit first. */
10190 22123 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10191 : {
10192 28 : slp_tree phis_node = slp_node_instance->reduc_phis;
10193 28 : stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
10194 89 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10195 28 : if (exit != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
10196 : {
10197 23 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10198 : phis_node, slp_node_instance,
10199 : exit);
10200 23 : break;
10201 28 : }
10202 28 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10203 9 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10204 : phis_node, slp_node_instance,
10205 : LOOP_VINFO_MAIN_EXIT
10206 : (loop_vinfo));
10207 : }
10208 :
10209 22123 : return true;
10210 : }
10211 :
10212 : /* If STMT is not relevant and it is a simple assignment and its inputs are
10213 : invariant then it can remain in place, unvectorized. The original last
10214 : scalar value that it computes will be used. */
10215 174702 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10216 : {
10217 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10218 0 : if (dump_enabled_p ())
10219 0 : dump_printf_loc (MSG_NOTE, vect_location,
10220 : "statement is simple and uses invariant. Leaving in "
10221 : "place.\n");
10222 0 : return true;
10223 : }
10224 :
10225 174702 : gcc_assert (slp_index >= 0);
10226 :
10227 : /* Get the last occurrence of the scalar index from the concatenation of
10228 : all the slp vectors. Calculate which slp vector it is and the index
10229 : within. */
10230 174702 : int num_scalar = SLP_TREE_LANES (slp_node);
10231 174702 : int num_vec = vect_get_num_copies (vinfo, slp_node);
10232 174702 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10233 :
10234 : /* Calculate which vector contains the result, and which lane of
10235 : that vector we need. */
10236 174702 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10237 : {
10238 : if (dump_enabled_p ())
10239 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10240 : "Cannot determine which vector holds the"
10241 : " final result.\n");
10242 : return false;
10243 : }
10244 :
10245 174702 : if (!vec_stmt_p)
10246 : {
10247 : /* No transformation required. */
10248 136424 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10249 : {
10250 27340 : if (SLP_TREE_LANES (slp_node) != 1)
10251 : {
10252 19 : if (dump_enabled_p ())
10253 19 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10254 : "can't operate on partial vectors "
10255 : "because an SLP statement is live after "
10256 : "the loop.\n");
10257 19 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10258 : }
10259 27321 : else if (num_vec > 1)
10260 : {
10261 15573 : if (dump_enabled_p ())
10262 53 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10263 : "can't operate on partial vectors "
10264 : "because ncopies is greater than 1.\n");
10265 15573 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10266 : }
10267 : else
10268 : {
10269 11748 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10270 : OPTIMIZE_FOR_SPEED))
10271 0 : vect_record_loop_mask (loop_vinfo,
10272 : &LOOP_VINFO_MASKS (loop_vinfo),
10273 : 1, vectype, NULL);
10274 11748 : else if (can_vec_extract_var_idx_p (
10275 11748 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10276 0 : vect_record_loop_len (loop_vinfo,
10277 : &LOOP_VINFO_LENS (loop_vinfo),
10278 : 1, vectype, 1);
10279 : else
10280 : {
10281 11748 : if (dump_enabled_p ())
10282 655 : dump_printf_loc (
10283 655 : MSG_MISSED_OPTIMIZATION, vect_location,
10284 : "can't operate on partial vectors "
10285 : "because the target doesn't support extract "
10286 : "last reduction.\n");
10287 11748 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10288 : }
10289 : }
10290 : }
10291 : /* ??? Enable for loop costing as well. */
10292 27340 : if (!loop_vinfo)
10293 64904 : record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
10294 : 0, vect_epilogue);
10295 136424 : SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
10296 136424 : return true;
10297 : }
10298 :
10299 : /* Use the lhs of the original scalar statement. */
10300 38278 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10301 38278 : if (dump_enabled_p ())
10302 988 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10303 : "stmt %G", stmt);
10304 :
10305 38278 : lhs = gimple_get_lhs (stmt);
10306 38278 : lhs_type = TREE_TYPE (lhs);
10307 :
10308 38278 : bitsize = vector_element_bits_tree (vectype);
10309 :
10310 : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10311 38278 : gcc_assert (!loop_vinfo
10312 : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10313 : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10314 : || SLP_TREE_LANES (slp_node) == 1));
10315 :
10316 : /* Get the correct slp vectorized stmt. */
10317 38278 : tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10318 38278 : gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10319 :
10320 : /* In case we need to early break vectorize also get the first stmt. */
10321 38278 : tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10322 :
10323 : /* Get entry to use. */
10324 38278 : tree bitstart = bitsize_int (vec_index);
10325 38278 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10326 :
10327 38278 : if (loop_vinfo)
10328 : {
10329 : /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10330 : requirement, insert one phi node for it. It looks like:
10331 : loop;
10332 : BB:
10333 : # lhs' = PHI <lhs>
10334 : ==>
10335 : loop;
10336 : BB:
10337 : # vec_lhs' = PHI <vec_lhs>
10338 : new_tree = lane_extract <vec_lhs', ...>;
10339 : lhs' = new_tree; */
10340 :
10341 2905 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10342 : /* Check if we have a loop where the chosen exit is not the main exit,
10343 : in these cases for an early break we restart the iteration the vector code
10344 : did. For the live values we want the value at the start of the iteration
10345 : rather than at the end. */
10346 2905 : edge main_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
10347 2905 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10348 15064 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10349 9254 : if (!is_gimple_debug (use_stmt)
10350 9254 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10351 2842 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10352 : {
10353 2842 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10354 2842 : phi_arg_index_from_use (use_p));
10355 2842 : gcc_assert (loop_exit_edge_p (loop, e));
10356 2842 : bool main_exit_edge = e == main_e;
10357 2842 : tree tmp_vec_lhs = vec_lhs;
10358 2842 : tree tmp_bitstart = bitstart;
10359 :
10360 : /* For early exit where the exit is not in the BB that leads
10361 : to the latch then we're restarting the iteration in the
10362 : scalar loop. So get the first live value. */
10363 2842 : bool early_break_first_element_p
10364 2842 : = all_exits_as_early_p || !main_exit_edge;
10365 2842 : if (early_break_first_element_p)
10366 : {
10367 199 : tmp_vec_lhs = vec_lhs0;
10368 199 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10369 : }
10370 :
10371 2842 : gimple_stmt_iterator exit_gsi;
10372 2842 : tree new_tree
10373 2842 : = vectorizable_live_operation_1 (loop_vinfo,
10374 : e->dest, vectype,
10375 : slp_node, bitsize,
10376 : tmp_bitstart, tmp_vec_lhs,
10377 : lhs_type, &exit_gsi);
10378 :
10379 2842 : auto gsi = gsi_for_stmt (use_stmt);
10380 2842 : tree lhs_phi = gimple_phi_result (use_stmt);
10381 2842 : remove_phi_node (&gsi, false);
10382 2842 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10383 2842 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10384 2842 : break;
10385 2905 : }
10386 :
10387 : /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10388 12222 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10389 6412 : gcc_assert (is_gimple_debug (use_stmt)
10390 2905 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10391 : }
10392 : else
10393 : {
10394 : /* For basic-block vectorization simply insert the lane-extraction. */
10395 35373 : tree bftype = TREE_TYPE (vectype);
10396 35373 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10397 2 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10398 35373 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10399 : vec_lhs, bitsize, bitstart);
10400 35373 : gimple_seq stmts = NULL;
10401 35373 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10402 : &stmts, true, NULL_TREE);
10403 35373 : if (TREE_CODE (new_tree) == SSA_NAME
10404 70746 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10405 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10406 35373 : if (is_a <gphi *> (vec_stmt))
10407 : {
10408 2515 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10409 2515 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10410 : }
10411 : else
10412 : {
10413 32858 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10414 32858 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10415 : }
10416 :
10417 : /* Replace use of lhs with newly computed result. If the use stmt is a
10418 : single arg PHI, just replace all uses of PHI result. It's necessary
10419 : because lcssa PHI defining lhs may be before newly inserted stmt. */
10420 35373 : use_operand_p use_p;
10421 35373 : stmt_vec_info use_stmt_info;
10422 208423 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10423 137677 : if (!is_gimple_debug (use_stmt)
10424 137677 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10425 101208 : || !PURE_SLP_STMT (use_stmt_info)))
10426 : {
10427 : /* ??? This can happen when the live lane ends up being
10428 : rooted in a vector construction code-generated by an
10429 : external SLP node (and code-generation for that already
10430 : happened).
10431 : Doing this is what would happen if that vector CTOR
10432 : were not code-generated yet so it is not too bad.
10433 : ??? In fact we'd likely want to avoid this situation
10434 : in the first place. */
10435 61590 : if (TREE_CODE (new_tree) == SSA_NAME
10436 61590 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10437 61590 : && gimple_code (use_stmt) != GIMPLE_PHI
10438 116497 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10439 : use_stmt))
10440 : {
10441 0 : if (dump_enabled_p ())
10442 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10443 : "Using original scalar computation for "
10444 : "live lane because use precedes vector "
10445 : "def\n");
10446 0 : continue;
10447 : }
10448 188946 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10449 : {
10450 : /* ??? It can also happen that we end up pulling a def into
10451 : a loop where replacing out-of-loop uses would require
10452 : a new LC SSA PHI node. Retain the original scalar in
10453 : those cases as well. PR98064. */
10454 63678 : edge e;
10455 63678 : if (TREE_CODE (new_tree) == SSA_NAME
10456 63678 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10457 63678 : && (gimple_bb (use_stmt)->loop_father
10458 63678 : != gimple_bb (vec_stmt)->loop_father)
10459 : /* But a replacement in a LC PHI is OK. This happens
10460 : in gcc.dg/vect/bb-slp-57.c for example. */
10461 7303 : && (gimple_code (use_stmt) != GIMPLE_PHI
10462 3161 : || (((e = phi_arg_edge_from_use (use_p)), true)
10463 3161 : && !loop_exit_edge_p
10464 3161 : (gimple_bb (vec_stmt)->loop_father, e)))
10465 69289 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10466 5611 : gimple_bb (use_stmt)->loop_father))
10467 : {
10468 0 : if (dump_enabled_p ())
10469 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10470 : "Using original scalar computation for "
10471 : "live lane because there is an "
10472 : "out-of-loop definition for it\n");
10473 0 : continue;
10474 : }
10475 63678 : SET_USE (use_p, new_tree);
10476 : }
10477 61590 : update_stmt (use_stmt);
10478 35373 : }
10479 : }
10480 :
10481 : return true;
10482 261768 : }
10483 :
10484 : /* Given loop represented by LOOP_VINFO, return true if computation of
10485 : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10486 : otherwise. The loop must have a counted number of iterations (asserted). */
10487 :
10488 : static bool
10489 61760 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
10490 : {
10491 61760 : gcc_assert (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo));
10492 :
10493 : /* Constant case: NITERS did not wrap around if NITERSM1 compares smaller than NITERS as widest integers. */
10494 61760 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10495 : {
10496 35996 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10497 35996 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10498 :
10499 35996 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10500 35996 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10501 35996 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10502 : return true;
10503 : } /* Otherwise fall through to the bound check below. */
10504 :
10505 25764 : widest_int max;
10506 25764 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10507 : /* Check the upper bound of loop niters: if get_max_loop_iterations provides a bound that is strictly below the maximum value of the NITERS type, report no overflow. */
10508 25764 : if (get_max_loop_iterations (loop, &max))
10509 : {
10510 25764 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10511 25764 : signop sgn = TYPE_SIGN (type);
10512 25764 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10513 25764 : if (max < type_max)
10514 25587 : return true;
10515 25764 : }
10516 : return false;
10517 25764 : }
10518 :
10519 : /* Return a mask type with half as many elements as OLD_TYPE,
10520 : given that it should have mode NEW_MODE. The element count of OLD_TYPE is halved with exact_div, so it is expected to be a multiple of two. */
10521 :
10522 : tree
10523 4795 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10524 : {
10525 4795 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10526 4795 : return build_truth_vector_type_for_mode (nunits, new_mode);
10527 : }
10528 :
10529 : /* Return a mask type with twice as many elements as OLD_TYPE,
10530 : given that it should have mode NEW_MODE. The type is built with build_truth_vector_type_for_mode. */
10531 :
10532 : tree
10533 7208 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10534 : {
10535 7208 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10536 7208 : return build_truth_vector_type_for_mode (nunits, new_mode);
10537 : }
10538 :
10539 : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10540 : contain a sequence of NVECTORS masks that each control a vector of type
10541 : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10542 : these vector masks with the vector version of SCALAR_MASK. NVECTORS must be nonzero (asserted). */
10543 :
10544 : void
10545 105109 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10546 : unsigned int nvectors, tree vectype, tree scalar_mask)
10547 : {
10548 105109 : gcc_assert (nvectors != 0);
10549 :
10550 105109 : if (scalar_mask)
10551 : { /* Remember the (SCALAR_MASK, NVECTORS) combination in LOOP_VINFO's scalar_cond_masked_set. */
10552 4979 : scalar_cond_masked_key cond (scalar_mask, nvectors);
10553 4979 : loop_vinfo->scalar_cond_masked_set.add (cond);
10554 : }
10555 :
10556 105109 : masks->mask_set.add (std::make_pair (vectype, nvectors)); /* Record the (VECTYPE, NVECTORS) pair in MASKS. */
10557 105109 : }
10558 :
10559 : /* Given a complete set of masks MASKS, extract mask number INDEX
10560 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10561 : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10562 :
10563 : See the comment above vec_loop_masks for more details about the mask
10564 : arrangement. Only the vect_partial_vectors_while_ult and vect_partial_vectors_avx512 styles are handled; any other style is unreachable. */
10565 :
10566 : tree
10567 208 : vect_get_loop_mask (loop_vec_info loop_vinfo,
10568 : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10569 : unsigned int nvectors, tree vectype, unsigned int index)
10570 : {
10571 208 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10572 : == vect_partial_vectors_while_ult)
10573 : {
10574 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1]; /* In this style the rgroup is looked up by NVECTORS - 1. */
10575 0 : tree mask_type = rgm->type;
10576 :
10577 : /* Populate the rgroup's mask array, if this is the first time we've
10578 : used it. */
10579 0 : if (rgm->controls.is_empty ())
10580 : {
10581 0 : rgm->controls.safe_grow_cleared (nvectors, true);
10582 0 : for (unsigned int i = 0; i < nvectors; ++i)
10583 : {
10584 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10585 : /* Provide a dummy definition until the real one is available. */
10586 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10587 0 : rgm->controls[i] = mask;
10588 : }
10589 : }
10590 :
10591 0 : tree mask = rgm->controls[index];
10592 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10593 0 : TYPE_VECTOR_SUBPARTS (vectype)))
10594 : {
10595 : /* A loop mask for data type X can be reused for data type Y
10596 : if X has N times more elements than Y and if Y's elements
10597 : are N times bigger than X's. In this case each sequence
10598 : of N elements in the loop mask will be all-zero or all-one.
10599 : We can then view-convert the mask so that each sequence of
10600 : N elements is replaced by a single element. */
10601 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10602 : TYPE_VECTOR_SUBPARTS (vectype)));
10603 0 : gimple_seq seq = NULL;
10604 0 : mask_type = truth_type_for (vectype);
10605 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10606 0 : if (seq)
10607 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10608 : }
10609 0 : return mask;
10610 : }
10611 208 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10612 : == vect_partial_vectors_avx512)
10613 : {
10614 : /* The number of scalars per iteration and the number of vectors are
10615 : both compile-time constants. */
10616 208 : unsigned int nscalars_per_iter
10617 208 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10618 208 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10619 :
10620 208 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1]; /* In this style the rgroup is looked up by scalars per iteration - 1. */
10621 :
10622 : /* The stored nV is dependent on the mask type produced. */
10623 208 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10624 : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10625 : == rgm->factor);
10626 208 : nvectors = rgm->factor; /* From here on NVECTORS is the number of controls held by the rgroup. */
10627 :
10628 : /* Populate the rgroup's mask array, if this is the first time we've
10629 : used it. */
10630 208 : if (rgm->controls.is_empty ())
10631 : {
10632 20 : rgm->controls.safe_grow_cleared (nvectors, true);
10633 106 : for (unsigned int i = 0; i < nvectors; ++i)
10634 : {
10635 86 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10636 : /* Provide a dummy definition until the real one is available. */
10637 86 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10638 86 : rgm->controls[i] = mask;
10639 : }
10640 : }
10641 208 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10642 : TYPE_VECTOR_SUBPARTS (vectype)))
10643 160 : return rgm->controls[index];
10644 :
10645 : /* Split the vector if needed. Since we are dealing with integer mode
10646 : masks with AVX512 we can operate on the integer representation
10647 : performing the whole vector shifting. */
10648 48 : unsigned HOST_WIDE_INT factor;
10649 48 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10650 48 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
10651 0 : gcc_assert (ok);
10652 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10653 48 : tree mask_type = truth_type_for (vectype);
10654 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10655 48 : unsigned vi = index / factor; /* Which stored control holds the requested mask. */
10656 48 : unsigned vpart = index % factor; /* Which part of that control is requested. */
10657 48 : tree vec = rgm->controls[vi];
10658 48 : gimple_seq seq = NULL;
10659 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10660 48 : lang_hooks.types.type_for_mode
10661 48 : (TYPE_MODE (rgm->type), 1), vec);
10662 : /* For integer mode masks simply shift the right bits into position. */
10663 48 : if (vpart != 0)
10664 40 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10665 : build_int_cst (integer_type_node,
10666 80 : (TYPE_VECTOR_SUBPARTS (vectype)
10667 40 : * vpart)));
10668 48 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10669 48 : (TYPE_MODE (mask_type), 1), vec);
10670 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10671 48 : if (seq)
10672 48 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10673 48 : return vec;
10674 : }
10675 : else
10676 0 : gcc_unreachable ();
10677 : }
10678 :
10679 : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10680 : lengths for controlling an operation on VECTYPE. The operation splits
10681 : each element of VECTYPE into FACTOR separate subelements, measuring the
10682 : length as a number of these subelements. NVECTORS must be nonzero (asserted). */
10683 :
10684 : void
10685 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10686 : unsigned int nvectors, tree vectype, unsigned int factor)
10687 : {
10688 0 : gcc_assert (nvectors != 0);
10689 0 : if (lens->length () < nvectors)
10690 0 : lens->safe_grow_cleared (nvectors, true);
10691 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1]; /* The rgroup for NVECTORS vectors lives at index NVECTORS - 1. */
10692 :
10693 : /* The number of scalars per iteration, scalar occupied bytes and
10694 : the number of vectors are all compile-time constants. */
10695 0 : unsigned int nscalars_per_iter
10696 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10697 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10698 :
10699 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter) /* Only update the rgroup when this use needs more scalars per iteration than recorded so far. */
10700 : {
10701 : /* For now, we only support cases in which all loads and stores fall back
10702 : to VnQI or none do. */
10703 0 : gcc_assert (!rgl->max_nscalars_per_iter
10704 : || (rgl->factor == 1 && factor == 1)
10705 : || (rgl->max_nscalars_per_iter * rgl->factor
10706 : == nscalars_per_iter * factor));
10707 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
10708 0 : rgl->type = vectype;
10709 0 : rgl->factor = factor;
10710 : }
10711 0 : }
10712 :
10713 : /* Given a complete set of lengths LENS, extract length number INDEX
10714 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10715 : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10716 : multiplied by the number of elements that should be processed. If ADJUSTED is true and LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS is nonzero, return the rgroup's bias-adjusted control instead.
10717 : Insert any set-up statements before GSI. */
10718 :
10719 : tree
10720 0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10721 : vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10722 : unsigned int index, unsigned int factor, bool adjusted)
10723 : {
10724 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10725 0 : bool use_bias_adjusted_len =
10726 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10727 :
10728 : /* Populate the rgroup's len array, if this is the first time we've
10729 : used it. */
10730 0 : if (rgl->controls.is_empty ())
10731 : {
10732 0 : rgl->controls.safe_grow_cleared (nvectors, true);
10733 0 : for (unsigned int i = 0; i < nvectors; ++i)
10734 : {
10735 0 : tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10736 0 : gcc_assert (len_type != NULL_TREE);
10737 :
10738 0 : tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10739 :
10740 : /* Provide a dummy definition until the real one is available. */
10741 0 : SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10742 0 : rgl->controls[i] = len;
10743 :
10744 0 : if (use_bias_adjusted_len)
10745 : {
10746 0 : gcc_assert (i == 0); /* With a nonzero bias only a single control per rgroup is expected. */
10747 0 : tree adjusted_len =
10748 0 : make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10749 0 : SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10750 0 : rgl->bias_adjusted_ctrl = adjusted_len;
10751 : }
10752 : }
10753 : }
10754 :
10755 0 : if (use_bias_adjusted_len && adjusted)
10756 0 : return rgl->bias_adjusted_ctrl;
10757 :
10758 0 : tree loop_len = rgl->controls[index];
10759 0 : if (rgl->factor == 1 && factor == 1)
10760 : {
10761 0 : poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10762 0 : poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10763 0 : if (maybe_ne (nunits1, nunits2))
10764 : {
10765 : /* A loop len for data type X can be reused for data type Y
10766 : if X has N times more elements than Y and if Y's elements
10767 : are N times bigger than X's. */
10768 0 : gcc_assert (multiple_p (nunits1, nunits2));
10769 0 : factor = exact_div (nunits1, nunits2).to_constant ();
10770 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10771 0 : gimple_seq seq = NULL;
10772 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10773 0 : build_int_cst (iv_type, factor));
10774 0 : if (seq)
10775 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10776 : }
10777 0 : }
10778 0 : else if (factor && rgl->factor != factor)
10779 : {
10780 : /* The number of scalars per iteration, scalar occupied bytes and
10781 : the number of vectors are all compile-time constants. Rescale the length when the recorded rgroup size and the requested size differ. */
10782 0 : unsigned int nscalars_per_iter
10783 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10784 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10785 0 : unsigned int rglvecsize = rgl->factor * rgl->max_nscalars_per_iter;
10786 0 : unsigned int vecsize = nscalars_per_iter * factor;
10787 0 : if (rglvecsize > vecsize)
10788 : {
10789 0 : unsigned int fac = rglvecsize / vecsize;
10790 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10791 0 : gimple_seq seq = NULL;
10792 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10793 0 : build_int_cst (iv_type, fac));
10794 0 : if (seq)
10795 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10796 : }
10797 0 : else if (rglvecsize < vecsize)
10798 : {
10799 0 : unsigned int fac = vecsize / rglvecsize;
10800 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10801 0 : gimple_seq seq = NULL;
10802 0 : loop_len = gimple_build (&seq, MULT_EXPR, iv_type, loop_len,
10803 0 : build_int_cst (iv_type, fac));
10804 0 : if (seq)
10805 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10806 : }
10807 : }
10808 : return loop_len;
10809 : }
10810 :
10811 : /* Generate the tree for the loop len mask and return it. LENS,
10812 : NVECTORS, VECTYPE, INDEX and FACTOR are used to obtain the (bias-adjusted) length via vect_get_loop_len, with any set-up statements inserted before GSI. STMT is the compare mask; the call below is inserted before COND_GSI and its result has the type of STMT.
10813 :
10814 : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
10815 : */
10816 : tree
10817 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10818 : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
10819 : unsigned int nvectors, tree vectype, tree stmt,
10820 : unsigned int index, unsigned int factor)
10821 : {
10822 0 : tree all_one_mask = build_all_ones_cst (vectype);
10823 0 : tree all_zero_mask = build_zero_cst (vectype);
10824 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
10825 : factor, true);
10826 0 : tree bias = build_int_cst (intQI_type_node,
10827 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
10828 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
10829 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
10830 : all_one_mask, all_zero_mask, len,
10831 : bias);
10832 0 : gimple_call_set_lhs (call, len_mask);
10833 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
10834 :
10835 0 : return len_mask;
10836 : }
10837 :
10838 : /* Scale profiling counters by estimation for LOOP which is vectorized
10839 : by factor VF. EXIT_E is the exit edge whose probability is adjusted.
10840 : If FLAT is true, the loop we started with had unrealistically flat
10841 : profile. */
10842 :
10843 : static void
10844 61803 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
10845 : {
10846 : /* For flat profiles do not scale down proportionally by VF and only
10847 : cap by known iteration count bounds. */
10848 61803 : if (flat)
10849 : {
10850 34713 : if (dump_file && (dump_flags & TDF_DETAILS))
10851 5303 : fprintf (dump_file,
10852 : "Vectorized loop profile seems flat; not scaling iteration "
10853 : "count down by the vectorization factor %i\n", vf);
10854 34713 : scale_loop_profile (loop, profile_probability::always (),
10855 : get_likely_max_loop_iterations_int (loop));
10856 34713 : return;
10857 : }
10858 : /* The loop body executes VF times less often and the exit probability increases VF times. */
10859 27090 : profile_count entry_count = loop_preheader_edge (loop)->count ();
10860 :
10861 : /* If we have unreliable loop profile avoid dropping entry
10862 : count below header count. This can happen since loops
10863 : have unrealistically low trip counts. Halve the local VF until the check below no longer triggers. */
10864 27090 : while (vf > 1
10865 28156 : && loop->header->count > entry_count
10866 57342 : && loop->header->count < entry_count * vf)
10867 : {
10868 2096 : if (dump_file && (dump_flags & TDF_DETAILS))
10869 155 : fprintf (dump_file,
10870 : "Vectorization factor %i seems too large for profile "
10871 : "previously believed to be consistent; reducing.\n", vf);
10872 2096 : vf /= 2;
10873 : }
10874 :
10875 27090 : if (entry_count.nonzero_p ())
10876 27090 : set_edge_probability_and_rescale_others
10877 27090 : (exit_e,
10878 27090 : entry_count.probability_in (loop->header->count / vf));
10879 : /* Avoid producing very large exit probability when we do not have
10880 : sensible profile. */
10881 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
10882 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10883 27090 : loop->latch->count = single_pred_edge (loop->latch)->count (); /* Refresh the latch count from its single predecessor edge. */
10884 :
10885 27090 : scale_loop_profile (loop, profile_probability::always () / vf,
10886 : get_likely_max_loop_iterations_int (loop));
10887 : }
10888 :
10889 : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10890 : original loop that has now been vectorized.
10891 :
10892 : The inits of the data_references need to be advanced with the number of
10893 : iterations of the main loop. This has been computed in vect_do_peeling and
10894 : is stored in parameter ADVANCE.
10895 :
10896 : Since the loop_vec_info of this EPILOGUE was constructed for the original
10897 : loop, its stmt_vec_infos all point to the original statements. These need
10898 : to be updated to point to their corresponding copies.
10899 :
10900 : The data_reference's connections also need to be updated. Their
10901 : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10902 : stmt_vec_infos, their statements need to point to their corresponding
10903 : copy. The advancement is also remembered in LOOP_VINFO_DRS_ADVANCED_BY. */
10904 :
10905 : static void
10906 6847 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10907 : {
10908 6847 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10909 6847 : hash_map<tree,tree> mapping; /* NOTE(review): MAPPING is not referenced anywhere else in this function. */
10910 6847 : gimple *orig_stmt, *new_stmt;
10911 6847 : gimple_stmt_iterator epilogue_gsi;
10912 6847 : gphi_iterator epilogue_phi_gsi;
10913 6847 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10914 6847 : basic_block *epilogue_bbs = get_loop_body (epilogue);
10915 6847 : unsigned i; /* Used by the data-reference walk below; the block loop declares its own I. */
10916 :
10917 6847 : free (LOOP_VINFO_BBS (epilogue_vinfo)); /* Replace the recorded block array with the EPILOGUE's own blocks. */
10918 6847 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10919 6847 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
10920 :
10921 : /* The EPILOGUE loop is a copy of the original loop so they share the same
10922 : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10923 : point to the copied statements. The stmt_vec_info for a statement is found at index UID - 1. */
10924 20541 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10925 : {
10926 13694 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10927 35294 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10928 : {
10929 21600 : new_stmt = epilogue_phi_gsi.phi ();
10930 :
10931 21600 : gcc_assert (gimple_uid (new_stmt) > 0);
10932 21600 : stmt_vinfo
10933 21600 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10934 :
10935 21600 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10936 : }
10937 :
10938 27388 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10939 137238 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10940 : {
10941 123544 : new_stmt = gsi_stmt (epilogue_gsi);
10942 123544 : if (is_gimple_debug (new_stmt))
10943 20500 : continue;
10944 :
10945 103044 : gcc_assert (gimple_uid (new_stmt) > 0);
10946 103044 : stmt_vinfo
10947 103044 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10948 :
10949 103044 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10950 :
10951 103044 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10952 103044 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10953 : {
10954 1939 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10955 : /* Set BB such that the assert in
10956 : 'get_initial_defs_for_reduction' is able to determine that
10957 : the BB of the related stmt is inside this loop. */
10958 1939 : gimple_set_bb (stmt,
10959 : gimple_bb (new_stmt));
10960 1939 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10961 1939 : gcc_assert (related_vinfo == NULL
10962 : || related_vinfo == stmt_vinfo);
10963 : }
10964 : }
10965 : }
10966 :
10967 6847 : struct data_reference *dr;
10968 6847 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10969 30928 : FOR_EACH_VEC_ELT (datarefs, i, dr) /* Point every data reference at the statement now recorded for its UID. */
10970 : {
10971 24081 : orig_stmt = DR_STMT (dr);
10972 24081 : gcc_assert (gimple_uid (orig_stmt) > 0);
10973 24081 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10974 24081 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10975 : }
10976 :
10977 : /* Advance data_reference's with the number of iterations of the previous
10978 : loop and its prologue. */
10979 6847 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10980 :
10981 : /* Remember the advancement made. */
10982 6847 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
10983 6847 : }
10984 :
10985 : /* When vectorizing early breaks, statements that happen before
10986 : the early break in the current BB need to be moved to after the early
10987 : break. This function deals with that and assumes that any validity
10988 : checks have already been performed.
10989 :
10990 : While moving the statements it tracks the last seen VUSE and afterwards
10991 : updates the recorded loads and the virtual LC PHIs on the exits to use it. The statements
10992 : are moved to the position given by gsi_after_labels of LOOP_VINFO_EARLY_BRK_DEST_BB. */
10993 :
10994 : static void
10995 1411 : move_early_exit_stmts (loop_vec_info loop_vinfo)
10996 : {
10997 1411 : DUMP_VECT_SCOPE ("move_early_exit_stmts");
10998 :
10999 1411 : if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11000 1192 : return;
11001 :
11002 : /* Move all stmts that need moving. */
11003 219 : basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11004 219 : gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
11005 :
11006 219 : tree last_seen_vuse = NULL_TREE;
11007 537 : for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11008 : {
11009 : /* We have to update crossed degenerate virtual PHIs. Simply
11010 : elide them: all uses of the PHI result are replaced by its first argument and the PHI is removed. */
11011 318 : if (gphi *vphi = dyn_cast <gphi *> (stmt))
11012 : {
11013 7 : tree vdef = gimple_phi_result (vphi);
11014 7 : tree vuse = gimple_phi_arg_def (vphi, 0);
11015 7 : imm_use_iterator iter;
11016 7 : use_operand_p use_p;
11017 7 : gimple *use_stmt;
11018 30 : FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
11019 : {
11020 48 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
11021 16 : SET_USE (use_p, vuse);
11022 7 : }
11023 7 : auto gsi = gsi_for_stmt (stmt);
11024 7 : remove_phi_node (&gsi, true);
11025 7 : last_seen_vuse = vuse;
11026 7 : continue;
11027 7 : }
11028 :
11029 : /* Check to see if statement is still required for vect or has been
11030 : elided. */
11031 311 : auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11032 311 : if (!stmt_info)
11033 0 : continue;
11034 :
11035 311 : if (dump_enabled_p ())
11036 160 : dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11037 :
11038 311 : gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11039 311 : gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
11040 622 : last_seen_vuse = gimple_vuse (stmt);
11041 : }
11042 :
11043 : /* Update all the stmts with their new reaching VUSES. */
11044 689 : for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11045 : {
11046 198 : if (dump_enabled_p ())
11047 162 : dump_printf_loc (MSG_NOTE, vect_location,
11048 : "updating vuse to %T for load %G",
11049 : last_seen_vuse, p);
11050 198 : gimple_set_vuse (p, last_seen_vuse);
11051 198 : update_stmt (p);
11052 : }
11053 :
11054 : /* And update the virtual LC PHIs on exits whose source block is not dominated by DEST_BB. */
11055 1108 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11056 451 : if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11057 245 : if (gphi *phi = get_virtual_phi (e->dest))
11058 464 : SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11059 : }
11060 :
11061 : /* Generate adjustment code for early break scalar IVs filling in the value
11062 : we created earlier on for LOOP_VINFO_EARLY_BRK_NITERS_VAR. */
11063 :
11064 : static void
11065 1411 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vec_info loop_vinfo)
11066 : {
11067 1411 : DUMP_VECT_SCOPE ("vect_update_ivs_after_vectorizer_for_early_breaks");
11068 :
11069 1411 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
11070 : /* If no peeling was done then we have no IV to update. */
11071 1411 : || !LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo))
11072 584 : return;
11073 :
11074 827 : tree phi_var = LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo);
11075 827 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11076 827 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11077 827 : tree ty_var = TREE_TYPE (phi_var);
11078 827 : auto loop = LOOP_VINFO_LOOP (loop_vinfo);
11079 827 : tree induc_var = niters_skip ? copy_ssa_name (phi_var) : phi_var;
11080 :
11081 : /* Remove the existing dummy GIMPLE statement and just keep the def. */
11082 827 : gimple *def = SSA_NAME_DEF_STMT (phi_var);
11083 827 : auto def_gsi = gsi_for_stmt (def);
11084 827 : gsi_remove (&def_gsi, true);
11085 :
11086 827 : auto induction_phi = create_phi_node (induc_var, loop->header);
11087 827 : tree induc_def = PHI_RESULT (induction_phi);
11088 :
11089 : /* Create the iv update inside the loop. */
11090 827 : gimple_seq init_stmts = NULL;
11091 827 : gimple_seq stmts = NULL;
11092 827 : gimple_seq iv_stmts = NULL;
11093 827 : tree tree_vf = build_int_cst (ty_var, vf);
11094 :
11095 : /* For loop len targets we have to use .SELECT_VL (ivtmp_33, VF); instead of
11096 : just += VF as the VF can change in between two loop iterations. */
11097 827 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
11098 : {
11099 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
11100 0 : tree_vf = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
11101 : NULL_TREE, 0, 0, true);
11102 : }
11103 :
11104 827 : tree iter_var;
11105 827 : if (POINTER_TYPE_P (ty_var))
11106 : {
11107 0 : tree offset = gimple_convert (&stmts, sizetype, tree_vf);
11108 0 : iter_var = gimple_build (&stmts, POINTER_PLUS_EXPR, ty_var, induc_def,
11109 : gimple_convert (&stmts, sizetype, offset));
11110 : }
11111 : else
11112 : {
11113 827 : tree offset = gimple_convert (&stmts, ty_var, tree_vf);
11114 827 : iter_var = gimple_build (&stmts, PLUS_EXPR, ty_var, induc_def, offset);
11115 : }
11116 :
11117 827 : tree init_var = build_zero_cst (ty_var);
11118 827 : if (niters_skip)
11119 0 : init_var = gimple_build (&init_stmts, MINUS_EXPR, ty_var, init_var,
11120 : gimple_convert (&init_stmts, ty_var, niters_skip));
11121 :
11122 827 : add_phi_arg (induction_phi, iter_var,
11123 : loop_latch_edge (loop), UNKNOWN_LOCATION);
11124 827 : add_phi_arg (induction_phi, init_var,
11125 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
11126 :
11127 : /* The preheader edge; its source block receives the init_stmts below. */
11128 827 : auto pe = loop_preheader_edge (loop);
11129 :
11130 : /* If we've done any peeling, calculate the peeling adjustment needed to the
11131 : final IV. */
11132 827 : if (niters_skip)
11133 : {
11134 0 : tree induc_type = TREE_TYPE (induc_def);
11135 0 : tree s_induc_type = signed_type_for (induc_type);
11136 0 : induc_def = gimple_build (&iv_stmts, MAX_EXPR, s_induc_type,
11137 : gimple_convert (&iv_stmts, s_induc_type,
11138 : induc_def),
11139 : build_zero_cst (s_induc_type));
11140 0 : auto stmt = gimple_build_assign (phi_var,
11141 : gimple_convert (&iv_stmts, induc_type,
11142 : induc_def));
11143 0 : gimple_seq_add_stmt_without_update (&iv_stmts, stmt);
11144 0 : basic_block exit_bb = NULL;
11145 : /* Identify the early exit merge block. I wish we had stored this. */
11146 0 : for (auto e : get_loop_exit_edges (loop))
11147 0 : if (e != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
11148 : {
11149 0 : exit_bb = e->dest;
11150 0 : break;
11151 0 : }
11152 :
11153 0 : gcc_assert (exit_bb);
11154 0 : auto exit_gsi = gsi_after_labels (exit_bb);
11155 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
11156 : }
11157 : /* Write the init_stmts in the loop-preheader block. */
11158 827 : auto psi = gsi_last_nondebug_bb (pe->src);
11159 827 : gsi_insert_seq_after (&psi, init_stmts, GSI_LAST_NEW_STMT);
11160 : /* Write the adjustments in the header block. */
11161 827 : basic_block bb = loop->header;
11162 827 : auto si = gsi_after_labels (bb);
11163 827 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11164 : }
11165 :
11166 : /* Function vect_transform_loop.
11167 :
11168 : The analysis phase has determined that the loop is vectorizable.
11169 : Vectorize the loop - create vectorized stmts to replace the scalar
11170 : stmts in the loop, and update the loop exit condition.
11171 : Returns scalar epilogue loop if any. */
11172 :
11173 : class loop *
11174 61803 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11175 : {
11176 61803 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11177 61803 : class loop *epilogue = NULL;
11178 61803 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11179 61803 : int nbbs = loop->num_nodes;
11180 61803 : int i;
11181 61803 : tree niters_vector = NULL_TREE;
11182 61803 : tree step_vector = NULL_TREE;
11183 61803 : tree niters_vector_mult_vf = NULL_TREE;
11184 61803 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11185 61803 : unsigned int lowest_vf = constant_lower_bound (vf);
11186 61803 : gimple *stmt;
11187 61803 : bool check_profitability = false;
11188 61803 : unsigned int th;
11189 61803 : bool flat = maybe_flat_loop_profile (loop);
11190 61803 : bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);
11191 :
11192 61803 : DUMP_VECT_SCOPE ("vec_transform_loop");
11193 :
11194 61803 : if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11195 54956 : loop_vinfo->shared->check_datarefs ();
11196 :
11197 : /* Use the more conservative vectorization threshold. If the number
11198 : of iterations is constant assume the cost check has been performed
11199 : by our caller. If the threshold makes all loops profitable that
11200 : run at least the (estimated) vectorization factor number of times
11201 : checking is pointless, too. */
11202 61803 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11203 61803 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11204 : {
11205 18734 : if (dump_enabled_p ())
11206 176 : dump_printf_loc (MSG_NOTE, vect_location,
11207 : "Profitability threshold is %d loop iterations.\n",
11208 : th);
11209 : check_profitability = true;
11210 : }
11211 :
11212 : /* Make sure there exists a single-predecessor exit bb. Do this before
11213 : versioning. */
11214 61803 : edge e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
11215 61803 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11216 : {
11217 19098 : split_loop_exit_edge (e, true);
11218 19098 : if (dump_enabled_p ())
11219 2287 : dump_printf (MSG_NOTE, "split exit edge\n");
11220 : }
11221 :
11222 : /* Version the loop first, if required, so the profitability check
11223 : comes first. */
11224 :
11225 61803 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11226 : {
11227 3749 : class loop *sloop
11228 3749 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11229 3749 : sloop->force_vectorize = false;
11230 3749 : check_profitability = false;
11231 : }
11232 :
11233 : /* Make sure there exists a single-predecessor exit bb also on the
11234 : scalar loop copy. Do this after versioning but before peeling
11235 : so CFG structure is fine for both scalar and if-converted loop
11236 : to make slpeel_duplicate_current_defs_from_edges face matched
11237 : loop closed PHI nodes on the exit. */
11238 61803 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11239 : {
11240 8067 : e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
11241 8067 : if (! single_pred_p (e->dest))
11242 : {
11243 7807 : split_loop_exit_edge (e, true);
11244 7807 : if (dump_enabled_p ())
11245 1148 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11246 : }
11247 : }
11248 :
11249 61803 : tree niters = vect_build_loop_niters (loop_vinfo);
11250 61803 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11251 61803 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11252 61803 : tree advance;
11253 61803 : drs_init_vec orig_drs_init;
11254 61803 : bool niters_no_overflow = uncounted_p ? false /* Not known. */
11255 61760 : : loop_niters_no_overflow (loop_vinfo);
11256 :
11257 61803 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11258 : &step_vector, &niters_vector_mult_vf, th,
11259 : check_profitability, niters_no_overflow,
11260 : &advance);
11261 :
11262 : /* Assign hierarchical discriminators to the vectorized loop. */
11263 61803 : poly_uint64 vf_val = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11264 61803 : unsigned int vf_int = constant_lower_bound (vf_val);
11265 61803 : if (vf_int > DISCR_MULTIPLICITY_MAX)
11266 : vf_int = DISCR_MULTIPLICITY_MAX;
11267 :
11268 : /* Assign unique copy_id dynamically instead of using hardcoded constants.
11269 : Epilogue and main vectorized loops get different copy_ids. */
11270 61803 : gimple *loop_last = last_nondebug_stmt (loop->header);
11271 61803 : location_t loop_loc
11272 61803 : = loop_last ? gimple_location (loop_last) : UNKNOWN_LOCATION;
11273 61525 : if (loop_loc != UNKNOWN_LOCATION)
11274 : {
11275 50884 : unsigned int copyid = allocate_copyid_base (loop_loc, 1);
11276 50884 : assign_discriminators_to_loop (loop, vf_int, copyid);
11277 : }
11278 61803 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11279 61803 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11280 : {
11281 : /* Ifcvt duplicates loop preheader, loop body and produces a basic
11282 : block after loop exit. We need to scale all that. */
11283 88 : basic_block preheader
11284 88 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11285 88 : preheader->count
11286 : = preheader->count.apply_probability
11287 88 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11288 88 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11289 : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11290 88 : LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo)->dest->count = preheader->count;
11291 : }
11292 :
11293 61803 : if (niters_vector == NULL_TREE && !uncounted_p)
11294 : {
11295 28141 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11296 28141 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11297 57069 : && known_eq (lowest_vf, vf))
11298 : {
11299 28138 : niters_vector
11300 28138 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11301 28138 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11302 28138 : step_vector = build_one_cst (TREE_TYPE (niters));
11303 : }
11304 793 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11305 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11306 : &step_vector, niters_no_overflow);
11307 : else
11308 : /* vect_do_peeling subtracted the number of peeled prologue
11309 : iterations from LOOP_VINFO_NITERS. */
11310 792 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11311 : &niters_vector, &step_vector,
11312 : niters_no_overflow);
11313 : }
11314 :
11315 : /* 1) Make sure the loop header has exactly two entries
11316 : 2) Make sure we have a preheader basic block. */
11317 :
11318 61803 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11319 :
11320 61803 : split_edge (loop_preheader_edge (loop));
11321 :
11322 61803 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11323 : /* This will deal with any possible peeling. */
11324 1 : vect_prepare_for_masked_peels (loop_vinfo);
11325 :
11326 : /* Handle any code motion that we need to do for early-break vectorization after
11327 : we've done peeling but just before we start vectorizing. */
11328 61803 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11329 : {
11330 1411 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vinfo);
11331 1411 : move_early_exit_stmts (loop_vinfo);
11332 : }
11333 :
11334 : /* Remove existing clobber stmts and prefetches. */
11335 188728 : for (i = 0; i < nbbs; i++)
11336 : {
11337 126925 : basic_block bb = bbs[i];
11338 1095032 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11339 : {
11340 841182 : stmt = gsi_stmt (si);
11341 841182 : if (gimple_clobber_p (stmt)
11342 841182 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11343 : {
11344 88 : unlink_stmt_vdef (stmt);
11345 88 : gsi_remove (&si, true);
11346 88 : release_defs (stmt);
11347 : }
11348 : else
11349 841094 : gsi_next (&si);
11350 : }
11351 : }
11352 :
11353 : /* Schedule the SLP instances. */
11354 61803 : if (!loop_vinfo->slp_instances.is_empty ())
11355 : {
11356 61803 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11357 61803 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11358 : }
11359 :
11360 : /* Generate the loop invariant statements. */
11361 61803 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11362 : {
11363 73 : if (dump_enabled_p ())
11364 30 : dump_printf_loc (MSG_NOTE, vect_location,
11365 : "------>generating loop invariant statements\n");
11366 73 : gimple_stmt_iterator gsi;
11367 73 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11368 73 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11369 : GSI_CONTINUE_LINKING);
11370 : }
11371 :
11372 : /* Stub out scalar statements that must not survive vectorization and
11373 : were not picked as relevant in any SLP instance.
11374 : Doing this here helps with grouped statements, or statements that
11375 : are involved in patterns. */
11376 188728 : for (i = 0; i < nbbs; i++)
11377 : {
11378 126925 : basic_block bb = bbs[i];
11379 126925 : stmt_vec_info stmt_info;
11380 253850 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11381 1681355 : !gsi_end_p (gsi); gsi_next (&gsi))
11382 : {
11383 1554430 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11384 6348 : if (!call || !gimple_call_internal_p (call))
11385 1549241 : continue;
11386 5189 : internal_fn ifn = gimple_call_internal_fn (call);
11387 5189 : if (ifn == IFN_MASK_LOAD)
11388 : {
11389 737 : tree lhs = gimple_get_lhs (call);
11390 737 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11391 : {
11392 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11393 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11394 0 : gsi_replace (&gsi, new_stmt, true);
11395 : }
11396 : }
11397 4452 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11398 : {
11399 2297 : tree lhs = gimple_get_lhs (call);
11400 2297 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11401 : {
11402 0 : tree else_arg
11403 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11404 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11405 0 : gsi_replace (&gsi, new_stmt, true);
11406 : }
11407 : }
11408 2155 : else if (ifn == IFN_MASK_CALL
11409 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11410 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11411 2159 : && !STMT_VINFO_LIVE_P (stmt_info))
11412 : {
11413 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11414 4 : loop_vinfo->remove_stmt (stmt_info);
11415 : }
11416 : }
11417 : }
11418 :
11419 61803 : if (!uncounted_p)
11420 : {
11421 : /* The vectorization factor is always > 1, so if we use an IV increment of
11422 : 1, a zero NITERS becomes a nonzero NITERS_VECTOR. */
11423 61760 : if (integer_onep (step_vector))
11424 61742 : niters_no_overflow = true;
11425 :
11426 61760 : vect_set_loop_condition (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
11427 : loop_vinfo, niters_vector, step_vector,
11428 61760 : niters_vector_mult_vf, !niters_no_overflow);
11429 : }
11430 :
11431 61803 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11432 :
11433 : /* True if the final iteration might not handle a full vector's
11434 : worth of scalar iterations. */
11435 123606 : bool final_iter_may_be_partial
11436 61803 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11437 61803 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
11438 :
11439 : /* +1 to convert latch counts to loop iteration counts. */
11440 61803 : int bias_for_lowest = 1;
11441 :
11442 : /* When we are peeling for gaps then we take away one scalar iteration
11443 : from the vector loop. Thus we can adjust the upper bound by one
11444 : scalar iteration. But only when we know the bound applies to the
11445 : IV exit test which might not be true when we have multiple exits. */
11446 61803 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11447 120407 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11448 :
11449 61803 : int bias_for_assumed = bias_for_lowest;
11450 61803 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11451 61803 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11452 : {
11453 : /* When the amount of peeling is known at compile time, the first
11454 : iteration will have exactly alignment_npeels active elements.
11455 : In the worst case it will have at least one. */
11456 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11457 1 : bias_for_lowest += lowest_vf - min_first_active;
11458 1 : bias_for_assumed += assumed_vf - min_first_active;
11459 : }
11460 : /* In these calculations the "- 1" converts loop iteration counts
11461 : back to latch counts. */
11462 61803 : if (loop->any_upper_bound)
11463 : {
11464 61787 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11465 61787 : loop->nb_iterations_upper_bound
11466 61787 : = (final_iter_may_be_partial
11467 63200 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11468 2826 : lowest_vf) - 1
11469 60374 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11470 120748 : lowest_vf) - 1);
11471 61787 : if (main_vinfo
11472 : /* Both peeling for alignment and peeling for gaps can end up
11473 : with the scalar epilogue running for more than VF-1 iterations. */
11474 6847 : && !main_vinfo->peeling_for_alignment
11475 6799 : && !main_vinfo->peeling_for_gaps)
11476 : {
11477 6616 : unsigned int bound;
11478 6616 : poly_uint64 main_iters
11479 6616 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11480 : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11481 6616 : main_iters
11482 6616 : = upper_bound (main_iters,
11483 6616 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11484 13232 : if (can_div_away_from_zero_p (main_iters,
11485 6616 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11486 : &bound))
11487 6616 : loop->nb_iterations_upper_bound
11488 6616 : = wi::umin ((bound_wide_int) (bound - 1),
11489 6616 : loop->nb_iterations_upper_bound);
11490 : }
11491 : }
11492 61803 : if (loop->any_likely_upper_bound)
11493 61787 : loop->nb_iterations_likely_upper_bound
11494 61787 : = (final_iter_may_be_partial
11495 63200 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11496 1413 : + bias_for_lowest, lowest_vf) - 1
11497 60374 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11498 61787 : + bias_for_lowest, lowest_vf) - 1);
11499 61803 : if (loop->any_estimate)
11500 35614 : loop->nb_iterations_estimate
11501 35614 : = (final_iter_may_be_partial
11502 36307 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11503 1386 : assumed_vf) - 1
11504 34921 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11505 70535 : assumed_vf) - 1);
11506 61803 : scale_profile_for_vect_loop (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
11507 : assumed_vf, flat);
11508 :
11509 61803 : if (dump_enabled_p ())
11510 : {
11511 11015 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11512 : {
11513 9559 : dump_printf_loc (MSG_NOTE, vect_location,
11514 : "LOOP VECTORIZED\n");
11515 9559 : if (loop->inner)
11516 345 : dump_printf_loc (MSG_NOTE, vect_location,
11517 : "OUTER LOOP VECTORIZED\n");
11518 9559 : dump_printf (MSG_NOTE, "\n");
11519 : }
11520 : else
11521 1456 : dump_printf_loc (MSG_NOTE, vect_location,
11522 : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11523 1456 : GET_MODE_NAME (loop_vinfo->vector_mode));
11524 : }
11525 :
11526 : /* Loops vectorized with a variable factor won't benefit from
11527 : unrolling/peeling. */
11528 61803 : if (!vf.is_constant ())
11529 : {
11530 : loop->unroll = 1;
11531 : if (dump_enabled_p ())
11532 : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11533 : " variable-length vectorization factor\n");
11534 : }
11535 :
11536 : /* When we have unrolled the loop due to a user requested value we should
11537 : leave it up to the RTL unroll heuristics to determine if it's still worth
11538 : while to unroll more. */
11539 61803 : if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
11540 44 : loop->unroll = 0;
11541 :
11542 : /* Free SLP instances here because otherwise stmt reference counting
11543 : won't work. */
11544 : slp_instance instance;
11545 151888 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11546 90085 : vect_free_slp_instance (instance);
11547 61803 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11548 : /* Clear the safelen field: its value is invalid after vectorization
11549 : because the vectorized loop can have loop-carried dependencies. */
11550 61803 : loop->safelen = 0;
11551 :
11552 61803 : if (epilogue)
11553 : {
11554 : /* Accumulate past advancements made. */
11555 6847 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
11556 75 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
11557 : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
11558 : advance);
11559 6847 : update_epilogue_loop_vinfo (epilogue, advance);
11560 :
11561 6847 : epilogue->simduid = loop->simduid;
11562 6847 : epilogue->force_vectorize = loop->force_vectorize;
11563 6847 : epilogue->dont_vectorize = false;
11564 : }
11565 :
11566 61803 : return epilogue;
11567 61803 : }
11568 :
11569 : /* The code below is trying to perform simple optimization - revert
11570 : if-conversion for masked stores, i.e. if the mask of a store is zero
11571 : do not perform it and all stored value producers also if possible.
11572 : For example,
11573 : for (i=0; i<n; i++)
11574 : if (c[i])
11575 : {
11576 : p1[i] += 1;
11577 : p2[i] = p3[i] +2;
11578 : }
11579 : this transformation will produce the following semi-hammock:
11580 :
11581 : if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
11582 : {
11583 : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11584 : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11585 : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11586 : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11587 : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11588 : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11589 : }
11590 : */
11591 :
11592 : void
11593 493 : optimize_mask_stores (class loop *loop)
11594 : {
11595 493 : basic_block *bbs = get_loop_body (loop);
11596 493 : unsigned nbbs = loop->num_nodes;
11597 493 : unsigned i;
11598 493 : basic_block bb;
11599 493 : class loop *bb_loop;
11600 493 : gimple_stmt_iterator gsi;
11601 493 : gimple *stmt;
11602 493 : auto_vec<gimple *> worklist;
11603 493 : auto_purge_vect_location sentinel;
11604 :
11605 493 : vect_location = find_loop_location (loop);
11606 : /* Pick up all masked stores in loop if any. */
11607 1972 : for (i = 0; i < nbbs; i++)
11608 : {
11609 986 : bb = bbs[i];
11610 17311 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11611 15339 : gsi_next (&gsi))
11612 : {
11613 15339 : stmt = gsi_stmt (gsi);
11614 15339 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11615 695 : worklist.safe_push (stmt);
11616 : }
11617 : }
11618 :
11619 493 : free (bbs);
11620 493 : if (worklist.is_empty ())
11621 68 : return;
11622 :
11623 : /* Loop has masked stores. */
11624 1103 : while (!worklist.is_empty ())
11625 : {
11626 678 : gimple *last, *last_store;
11627 678 : edge e, efalse;
11628 678 : tree mask;
11629 678 : basic_block store_bb, join_bb;
11630 678 : gimple_stmt_iterator gsi_to;
11631 678 : tree vdef, new_vdef;
11632 678 : gphi *phi;
11633 678 : tree vectype;
11634 678 : tree zero;
11635 :
11636 678 : last = worklist.pop ();
11637 678 : mask = gimple_call_arg (last, 2);
11638 678 : bb = gimple_bb (last);
11639 : /* Create then_bb and if-then structure in CFG, then_bb belongs to
11640 : the same loop as if_bb. It could be different to LOOP when two
11641 : level loop-nest is vectorized and mask_store belongs to the inner
11642 : one. */
11643 678 : e = split_block (bb, last);
11644 678 : bb_loop = bb->loop_father;
11645 678 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11646 678 : join_bb = e->dest;
11647 678 : store_bb = create_empty_bb (bb);
11648 678 : add_bb_to_loop (store_bb, bb_loop);
11649 678 : e->flags = EDGE_TRUE_VALUE;
11650 678 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11651 : /* Put STORE_BB to likely part. */
11652 678 : efalse->probability = profile_probability::likely ();
11653 678 : e->probability = efalse->probability.invert ();
11654 678 : store_bb->count = efalse->count ();
11655 678 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11656 678 : if (dom_info_available_p (CDI_DOMINATORS))
11657 678 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11658 678 : if (dump_enabled_p ())
11659 351 : dump_printf_loc (MSG_NOTE, vect_location,
11660 : "Create new block %d to sink mask stores.",
11661 : store_bb->index);
11662 : /* Create vector comparison with boolean result. */
11663 678 : vectype = TREE_TYPE (mask);
11664 678 : zero = build_zero_cst (vectype);
11665 678 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11666 678 : gsi = gsi_last_bb (bb);
11667 678 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11668 : /* Create new PHI node for vdef of the last masked store:
11669 : .MEM_2 = VDEF <.MEM_1>
11670 : will be converted to
11671 : .MEM_3 = VDEF <.MEM_1>
11672 : and new PHI node will be created in join bb
11673 : .MEM_2 = PHI <.MEM_1, .MEM_3>
11674 : */
11675 678 : vdef = gimple_vdef (last);
11676 678 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
11677 678 : gimple_set_vdef (last, new_vdef);
11678 678 : phi = create_phi_node (vdef, join_bb);
11679 678 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11680 :
11681 : /* Put all masked stores with the same mask to STORE_BB if possible. */
11682 712 : while (true)
11683 : {
11684 695 : gimple_stmt_iterator gsi_from;
11685 695 : gimple *stmt1 = NULL;
11686 :
11687 : /* Move masked store to STORE_BB. */
11688 695 : last_store = last;
11689 695 : gsi = gsi_for_stmt (last);
11690 695 : gsi_from = gsi;
11691 : /* Shift GSI to the previous stmt for further traversal. */
11692 695 : gsi_prev (&gsi);
11693 695 : gsi_to = gsi_start_bb (store_bb);
11694 695 : gsi_move_before (&gsi_from, &gsi_to);
11695 : /* Setup GSI_TO to the non-empty block start. */
11696 695 : gsi_to = gsi_start_bb (store_bb);
11697 695 : if (dump_enabled_p ())
11698 367 : dump_printf_loc (MSG_NOTE, vect_location,
11699 : "Move stmt to created bb\n%G", last);
11700 : /* Move all stored value producers if possible. */
11701 4960 : while (!gsi_end_p (gsi))
11702 : {
11703 4959 : tree lhs;
11704 4959 : imm_use_iterator imm_iter;
11705 4959 : use_operand_p use_p;
11706 4959 : bool res;
11707 :
11708 : /* Skip debug statements. */
11709 4959 : if (is_gimple_debug (gsi_stmt (gsi)))
11710 : {
11711 3 : gsi_prev (&gsi);
11712 3225 : continue;
11713 : }
11714 4956 : stmt1 = gsi_stmt (gsi);
11715 : /* Do not consider statements writing to memory or having
11716 : volatile operand. */
11717 9762 : if (gimple_vdef (stmt1)
11718 9762 : || gimple_has_volatile_ops (stmt1))
11719 : break;
11720 4806 : gsi_from = gsi;
11721 4806 : gsi_prev (&gsi);
11722 4806 : lhs = gimple_get_lhs (stmt1);
11723 4806 : if (!lhs)
11724 : break;
11725 :
11726 : /* LHS of vectorized stmt must be SSA_NAME. */
11727 4806 : if (TREE_CODE (lhs) != SSA_NAME)
11728 : break;
11729 :
11730 4806 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11731 : {
11732 : /* Remove dead scalar statement. */
11733 3554 : if (has_zero_uses (lhs))
11734 : {
11735 3222 : gsi_remove (&gsi_from, true);
11736 3222 : release_defs (stmt1);
11737 3222 : continue;
11738 : }
11739 : }
11740 :
11741 : /* Check that LHS does not have uses outside of STORE_BB. */
11742 1584 : res = true;
11743 4309 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11744 : {
11745 1685 : gimple *use_stmt;
11746 1685 : use_stmt = USE_STMT (use_p);
11747 1685 : if (is_gimple_debug (use_stmt))
11748 0 : continue;
11749 1685 : if (gimple_bb (use_stmt) != store_bb)
11750 : {
11751 : res = false;
11752 : break;
11753 : }
11754 1584 : }
11755 1584 : if (!res)
11756 : break;
11757 :
11758 1040 : if (gimple_vuse (stmt1)
11759 1476 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
11760 : break;
11761 :
11762 : /* Can move STMT1 to STORE_BB. */
11763 1040 : if (dump_enabled_p ())
11764 563 : dump_printf_loc (MSG_NOTE, vect_location,
11765 : "Move stmt to created bb\n%G", stmt1);
11766 1040 : gsi_move_before (&gsi_from, &gsi_to);
11767 : /* Shift GSI_TO for further insertion. */
11768 2080 : gsi_prev (&gsi_to);
11769 : }
11770 : /* Put other masked stores with the same mask to STORE_BB. */
11771 695 : if (worklist.is_empty ()
11772 270 : || gimple_call_arg (worklist.last (), 2) != mask
11773 17 : || worklist.last () != stmt1)
11774 : break;
11775 17 : last = worklist.pop ();
11776 17 : }
11777 1356 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11778 : }
11779 493 : }
11780 :
11781 : /* Decide whether it is possible to use a zero-based induction variable
11782 : when vectorizing LOOP_VINFO with partial vectors. If it is, return
11783 : the value that the induction variable must be able to hold in order
11784 : to ensure that the rgroups eventually have no active vector elements.
11785 : Return -1 otherwise. */
11786 :
11787 : widest_int
11788 46786 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11789 : {
11790 46786 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11791 46786 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11792 46786 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11793 :
11794 : /* Calculate the value that the induction variable must be able
11795 : to hit in order to ensure that we end the loop with an all-false mask.
11796 : This involves adding the maximum number of inactive trailing scalar
11797 : iterations. */
11798 46786 : widest_int iv_limit = -1;
11799 46786 : if (max_loop_iterations (loop, &iv_limit))
11800 : {
11801 46786 : if (niters_skip)
11802 : {
11803 : /* Add the maximum number of skipped iterations to the
11804 : maximum iteration count. */
11805 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
11806 0 : iv_limit += wi::to_widest (niters_skip);
11807 : else
11808 0 : iv_limit += max_vf - 1;
11809 : }
11810 46786 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11811 : /* Make a conservatively-correct assumption. */
11812 320 : iv_limit += max_vf - 1;
11813 :
11814 : /* IV_LIMIT is the maximum number of latch iterations, which is also
11815 : the maximum in-range IV value. Round this value down to the previous
11816 : vector alignment boundary and then add an extra full iteration. */
11817 46786 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11818 46786 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11819 : }
11820 46786 : return iv_limit;
11821 : }
11822 :
11823 : /* For the given rgroup_controls RGC, check whether an induction variable
11824 : would ever hit a value that produces a set of all-false masks or zero
11825 : lengths before wrapping around. Return true if it's possible to wrap
11826 : around before hitting the desirable value, otherwise return false. */
11827 :
11828 : bool
11829 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11830 : {
11831 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11832 :
11833 0 : if (iv_limit == -1)
11834 : return true;
11835 :
11836 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11837 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
11838 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11839 :
11840 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11841 : return true;
11842 :
11843 : return false;
11844 0 : }
|