Line data Source code
1 : /* Loop Vectorization
2 : Copyright (C) 2003-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #define INCLUDE_ALGORITHM
23 : #include "config.h"
24 : #include "system.h"
25 : #include "coretypes.h"
26 : #include "backend.h"
27 : #include "target.h"
28 : #include "rtl.h"
29 : #include "tree.h"
30 : #include "gimple.h"
31 : #include "cfghooks.h"
32 : #include "tree-pass.h"
33 : #include "ssa.h"
34 : #include "optabs-tree.h"
35 : #include "memmodel.h"
36 : #include "optabs.h"
37 : #include "diagnostic-core.h"
38 : #include "fold-const.h"
39 : #include "stor-layout.h"
40 : #include "cfganal.h"
41 : #include "gimplify.h"
42 : #include "gimple-iterator.h"
43 : #include "gimplify-me.h"
44 : #include "tree-ssa-loop-ivopts.h"
45 : #include "tree-ssa-loop-manip.h"
46 : #include "tree-ssa-loop-niter.h"
47 : #include "tree-ssa-loop.h"
48 : #include "cfgloop.h"
49 : #include "tree-scalar-evolution.h"
50 : #include "tree-vectorizer.h"
51 : #include "gimple-fold.h"
52 : #include "cgraph.h"
53 : #include "tree-cfg.h"
54 : #include "tree-if-conv.h"
55 : #include "internal-fn.h"
56 : #include "tree-vector-builder.h"
57 : #include "vec-perm-indices.h"
58 : #include "tree-eh.h"
59 : #include "case-cfn-macros.h"
60 : #include "langhooks.h"
61 : #include "opts.h"
62 : #include "hierarchical_discriminator.h"
63 :
64 : /* Loop Vectorization Pass.
65 :
66 : This pass tries to vectorize loops.
67 :
68 : For example, the vectorizer transforms the following simple loop:
69 :
70 : short a[N]; short b[N]; short c[N]; int i;
71 :
72 : for (i=0; i<N; i++){
73 : a[i] = b[i] + c[i];
74 : }
75 :
76 : as if it was manually vectorized by rewriting the source code into:
77 :
78 : typedef int __attribute__((mode(V8HI))) v8hi;
79 : short a[N]; short b[N]; short c[N]; int i;
80 : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
81 : v8hi va, vb, vc;
82 :
83 : for (i=0; i<N/8; i++){
84 : vb = pb[i];
85 : vc = pc[i];
86 : va = vb + vc;
87 : pa[i] = va;
88 : }
89 :
90 : The main entry to this pass is vectorize_loops(), in which
91 : the vectorizer applies a set of analyses on a given set of loops,
92 : followed by the actual vectorization transformation for the loops that
93 : had successfully passed the analysis phase.
94 : Throughout this pass we make a distinction between two types of
95 : data: scalars (which are represented by SSA_NAMES), and memory references
96 : ("data-refs"). These two types of data require different handling both
97 : during analysis and transformation. The types of data-refs that the
98 : vectorizer currently supports are ARRAY_REFS whose base is an array DECL
99 : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
100 : accesses are required to have a simple (consecutive) access pattern.
101 :
102 : Analysis phase:
103 : ===============
104 : The driver for the analysis phase is vect_analyze_loop().
105 : It applies a set of analyses, some of which rely on the scalar evolution
106 : analyzer (scev) developed by Sebastian Pop.
107 :
108 : During the analysis phase the vectorizer records some information
109 : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
110 : loop, as well as general information about the loop as a whole, which is
111 : recorded in a "loop_vec_info" struct attached to each loop.
112 :
113 : Transformation phase:
114 : =====================
115 : The loop transformation phase scans all the stmts in the loop, and
116 : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
117 : the loop that needs to be vectorized. It inserts the vector code sequence
118 : just before the scalar stmt S, and records a pointer to the vector code
119 : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
120 : attached to S). This pointer will be used for the vectorization of following
121 : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
122 : otherwise, we rely on dead code elimination for removing it.
123 :
124 : For example, say stmt S1 was vectorized into stmt VS1:
125 :
126 : VS1: vb = px[i];
127 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
128 : S2: a = b;
129 :
130 : To vectorize stmt S2, the vectorizer first finds the stmt that defines
131 : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
132 : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
133 : resulting sequence would be:
134 :
135 : VS1: vb = px[i];
136 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
137 : VS2: va = vb;
138 : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
139 :
140 : Operands that are not SSA_NAMEs, are data-refs that appear in
141 : load/store operations (like 'x[i]' in S1), and are handled differently.
142 :
143 : Target modeling:
144 : =================
145 : Currently the only target specific information that is used is the
146 : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
147 : Targets that can support different sizes of vectors, for now will need
148 : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
149 : flexibility will be added in the future.
150 :
151 : Since we only vectorize operations whose vector form can be
152 : expressed using existing tree codes, to verify that an operation is
153 : supported, the vectorizer checks the relevant optab at the relevant
154 : machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
155 : the value found is CODE_FOR_nothing, then there's no target support, and
156 : we can't vectorize the stmt.
157 :
158 : For additional information on this project see:
159 : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 : */
161 :
162 : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
163 : unsigned *);
164 : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
165 : gphi **);
166 :
167 :
168 : /* Function vect_is_simple_iv_evolution.
169 :
170 : FORNOW: A simple evolution of an induction variable in the loop is
171 : considered a polynomial evolution. Return true if the evolution of ACCESS_FN in loop LOOP_NB passes the checks below. The initial value and step are stored in STMT_INFO before the final step check, so they are set even when that check fails. */
172 :
173 : static bool
174 908974 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
175 : stmt_vec_info stmt_info)
176 : {
177 908974 : tree init_expr;
178 908974 : tree step_expr;
179 908974 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
180 908974 : basic_block bb;
181 :
182 : /* When there is no evolution in this loop, the evolution function
183 : is not "simple". */
184 908974 : if (evolution_part == NULL_TREE)
185 : return false;
186 :
187 : /* When the evolution is a polynomial of degree >= 2
188 : the evolution function is not "simple". */
189 972599 : if (tree_is_chrec (evolution_part))
190 : return false;
191 :
192 798652 : step_expr = evolution_part;
193 798652 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
194 :
195 798652 : if (dump_enabled_p ())
196 39814 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
197 : step_expr, init_expr);
198 :
199 798652 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
200 798652 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;
201 :
202 798652 : if (TREE_CODE (step_expr) != INTEGER_CST
203 71359 : && (TREE_CODE (step_expr) != SSA_NAME
204 56421 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
205 56171 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
206 7772 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
207 131 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
208 131 : || !flag_associative_math)))
209 862334 : && (TREE_CODE (step_expr) != REAL_CST
210 431 : || !flag_associative_math))
211 : {
212 63625 : if (dump_enabled_p ())
213 3064 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
214 : "step unknown.\n");
215 63625 : return false;
216 : }
217 :
218 : return true;
219 : }
220 :
221 : /* Function vect_is_nonlinear_iv_evolution
222 :
223 : Only support nonlinear induction for integer type
224 : 1. neg
225 : 2. mul by constant
226 : 3. lshift/rshift by constant.
227 :
228 : For neg induction, return a fake step as integer -1. Returns true, with the evolution type, initial value and step recorded in STMT_INFO, when LOOP_PHI_NODE matches one of the forms above; nothing is recorded when it returns false. */
229 : static bool
230 171356 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
231 : gphi* loop_phi_node)
232 : {
233 171356 : tree init_expr, ev_expr, result, op1, op2;
234 171356 : gimple* def;
235 :
236 171356 : if (gimple_phi_num_args (loop_phi_node) != 2)
237 : return false;
238 :
239 171356 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
240 171356 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
241 :
242 : /* Support nonlinear induction only for integer type. */
243 171356 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
244 : return false;
245 :
246 108180 : result = PHI_RESULT (loop_phi_node);
247 :
248 108180 : if (TREE_CODE (ev_expr) != SSA_NAME
249 105872 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false) /* Assigns DEF; the comma expression itself is always false. */
250 108180 : || !is_gimple_assign (def))
251 : return false;
252 :
253 97335 : enum tree_code t_code = gimple_assign_rhs_code (def);
254 97335 : tree step;
255 97335 : switch (t_code)
256 : {
257 3508 : case NEGATE_EXPR:
258 3508 : if (gimple_assign_rhs1 (def) != result)
259 : return false;
260 3508 : step = build_int_cst (TREE_TYPE (init_expr), -1);
261 3508 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
262 3508 : break;
263 :
264 11311 : case RSHIFT_EXPR:
265 11311 : case LSHIFT_EXPR:
266 11311 : case MULT_EXPR:
267 11311 : op1 = gimple_assign_rhs1 (def);
268 11311 : op2 = gimple_assign_rhs2 (def);
269 11311 : if (TREE_CODE (op2) != INTEGER_CST
270 7453 : || op1 != result)
271 : return false;
272 7068 : step = op2;
273 7068 : if (t_code == LSHIFT_EXPR)
274 472 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
275 6596 : else if (t_code == RSHIFT_EXPR)
276 5620 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
277 : /* Only MULT_EXPR reaches here; NEGATE_EXPR is handled above as vect_step_op_neg. */
278 : else
279 976 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
280 : break;
281 :
282 : default:
283 : return false;
284 : }
285 :
286 10576 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
287 10576 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
288 :
289 10576 : return true;
290 : }
291 :
292 : /* Returns true if PHI is a first-order recurrence. A first-order
293 : recurrence is a non-reduction recurrence relation in which the value of
294 : the recurrence in the current loop iteration equals a value defined in
295 : the previous iteration. */
296 :
297 : static bool
298 66305 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
299 : gphi *phi)
300 : {
301 : /* A nested cycle isn't vectorizable as first order recurrence. */
302 66305 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
303 : return false;
304 :
305 : /* Ensure the loop latch definition is an SSA name defined by a non-PHI stmt from within the loop. */
306 66139 : edge latch = loop_latch_edge (loop);
307 66139 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
308 66139 : if (TREE_CODE (ldef) != SSA_NAME
309 63488 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
310 63422 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
311 124956 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
312 7972 : return false;
313 :
314 58167 : tree def = gimple_phi_result (phi);
315 :
316 : /* Ensure every non-debug use_stmt of the phi node is dominated by the latch
317 : definition and is not the latch definition itself. */
318 58167 : imm_use_iterator imm_iter;
319 58167 : use_operand_p use_p;
320 128628 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
321 69954 : if (!is_gimple_debug (USE_STMT (use_p))
322 136288 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
323 45838 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
324 : USE_STMT (use_p))))
325 57660 : return false;
326 :
327 : /* First-order recurrence autovectorization needs shuffle vector; here we only require that a vector type exists for the PHI result's scalar type. */
328 507 : tree scalar_type = TREE_TYPE (def);
329 507 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
330 507 : if (!vectype)
331 : return false;
332 :
333 : return true;
334 : }
335 :
336 : /* Function vect_analyze_scalar_cycles_1.
337 :
338 : Examine the cross iteration def-use cycles of scalar variables
339 : in LOOP. LOOP_VINFO represents the loop that is now being
340 : considered for vectorization (can be LOOP, or an outer-loop
341 : enclosing LOOP). Header PHIs are first classified as inductions; the
342 : rest are then checked for (double) reductions, nested cycles and first-order recurrences. */
343 :
344 : static void
345 446176 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
346 : {
347 446176 : basic_block bb = loop->header;
348 446176 : auto_vec<stmt_vec_info, 64> worklist;
349 446176 : gphi_iterator gsi;
350 :
351 446176 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
352 :
353 : /* First - identify all inductions. Reduction detection assumes that all the
354 : inductions have been identified, therefore, this order must not be
355 : changed. */
356 1596391 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
357 : {
358 1150215 : gphi *phi = gsi.phi ();
359 1150215 : tree access_fn = NULL;
360 1150215 : tree def = PHI_RESULT (phi);
361 1150215 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
362 :
363 : /* Skip virtual phi's. The data dependences that are associated with
364 : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
365 2300430 : if (virtual_operand_p (def))
366 404618 : continue;
367 :
368 : /* Skip already analyzed inner loop PHIs of double reductions. */
369 909974 : if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
370 1000 : continue;
371 :
372 908974 : if (dump_enabled_p ())
373 41938 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
374 : (gimple *) phi);
375 :
376 908974 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
377 :
378 : /* Analyze the evolution function. */
379 908974 : access_fn = analyze_scalar_evolution (loop, def);
380 908974 : if (dump_enabled_p ())
381 41938 : dump_printf_loc (MSG_NOTE, vect_location,
382 : "Access function of PHI: %T\n", access_fn);
383 908974 : if (access_fn)
384 908974 : STRIP_NOPS (access_fn);
385 :
386 1072351 : if ((!access_fn
387 908974 : || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
388 735027 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
389 11370 : && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
390 : != INTEGER_CST)))
391 : /* Only handle nonlinear iv for same loop. */
392 1082927 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
393 171356 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
394 : {
395 163377 : worklist.safe_push (stmt_vinfo);
396 163377 : continue;
397 : }
398 :
399 745597 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
400 : != NULL_TREE);
401 745597 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
402 :
403 745597 : if (dump_enabled_p ())
404 36859 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
405 745597 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
406 :
407 : /* Mark if we have a non-linear IV. NOTE(review): this plain assignment runs for every detected induction, so the flag reflects only the last one processed -- confirm that is intended. */
408 745597 : LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
409 745597 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
410 : }
411 :
412 :
413 : /* Second - identify all reductions and nested cycles among the PHIs that were not classified as inductions. */
414 609553 : while (worklist.length () > 0)
415 : {
416 163377 : stmt_vec_info stmt_vinfo = worklist.pop ();
417 163377 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
418 163377 : tree def = PHI_RESULT (phi);
419 :
420 163377 : if (dump_enabled_p ())
421 5079 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
422 : (gimple *) phi);
423 :
424 326754 : gcc_assert (!virtual_operand_p (def)
425 : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
426 :
427 163377 : gphi *double_reduc;
428 163377 : stmt_vec_info reduc_stmt_info
429 163377 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
430 163377 : if (reduc_stmt_info && double_reduc)
431 : {
432 1102 : stmt_vec_info inner_phi_info
433 1102 : = loop_vinfo->lookup_stmt (double_reduc);
434 : /* ??? Pass down flag we're the inner loop of a double reduc. */
435 1102 : stmt_vec_info inner_reduc_info
436 1102 : = vect_is_simple_reduction (loop_vinfo, inner_phi_info, NULL);
437 1102 : if (inner_reduc_info)
438 : {
439 1000 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
440 1000 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
441 1000 : STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
442 1000 : STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
443 1000 : if (dump_enabled_p ())
444 130 : dump_printf_loc (MSG_NOTE, vect_location,
445 : "Detected double reduction.\n");
446 :
447 1000 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
448 1000 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
449 1000 : STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
450 : /* Make it accessible for SLP vectorization. */
451 1000 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
452 : }
453 102 : else if (dump_enabled_p ())
454 14 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
455 : "Unknown def-use cycle pattern.\n");
456 : }
457 162275 : else if (reduc_stmt_info)
458 : {
459 95970 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
460 : {
461 2431 : if (dump_enabled_p ())
462 434 : dump_printf_loc (MSG_NOTE, vect_location,
463 : "Detected vectorizable nested cycle.\n");
464 :
465 2431 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
466 : }
467 : else
468 : {
469 93539 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
470 93539 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
471 93539 : if (dump_enabled_p ())
472 3950 : dump_printf_loc (MSG_NOTE, vect_location,
473 : "Detected reduction.\n");
474 :
475 93539 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
476 93539 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
477 93539 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
478 : }
479 : }
480 66305 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
481 501 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
482 : else
483 65804 : if (dump_enabled_p ())
484 477 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
485 : "Unknown def-use cycle pattern.\n");
486 : }
487 446176 : }
488 :
489 :
490 : /* Function vect_analyze_scalar_cycles.
491 :
492 : Examine the cross iteration def-use cycles of scalar variables, by
493 : analyzing the loop-header PHIs of scalar variables. Classify each
494 : cycle as one of the following: invariant, induction, reduction, unknown.
495 : We do that for the loop represented by LOOP_VINFO, and also for its
496 : inner-loop, if one exists.
497 : Examples for scalar cycles:
498 :
499 : Example1: reduction:
500 :
501 : loop1:
502 : for (i=0; i<N; i++)
503 : sum += a[i];
504 :
505 : Example2: induction:
506 :
507 : loop2:
508 : for (i=0; i<N; i++)
509 : a[i] = i; */
510 :
511 : static void
512 440398 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
513 : {
514 440398 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
515 :
516 440398 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
517 :
518 : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
519 : Reductions in such an inner-loop therefore have different properties than
520 : the reductions in the nest that gets vectorized:
521 : 1. When vectorized, they are executed in the same order as in the original
522 : scalar loop, so we can't change the order of computation when
523 : vectorizing them.
524 : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
525 : current checks are too strict. */
526 :
527 440398 : if (loop->inner)
528 5778 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
529 440398 : }
530 :
531 : /* Function vect_get_loop_niters.
532 :
533 : Determine how many iterations the loop is executed and place it
534 : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
535 : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
536 : niter information holds in ASSUMPTIONS.
537 : Only MAIN_EXIT is analyzed for the iteration count; the outputs keep their initial values (chrec_dont_know for the counts, boolean_true_node for ASSUMPTIONS) when that analysis fails.
538 : Return the exit conditions found for the exits of LOOP. */
539 :
540 :
541 : static vec<gcond *>
542 279806 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
543 : tree *number_of_iterations, tree *number_of_iterationsm1)
544 : {
545 279806 : auto_vec<edge> exits = get_loop_exit_edges (loop);
546 279806 : vec<gcond *> conds;
547 559612 : conds.create (exits.length ());
548 279806 : class tree_niter_desc niter_desc;
549 279806 : tree niter_assumptions, niter, may_be_zero;
550 :
551 279806 : *assumptions = boolean_true_node;
552 279806 : *number_of_iterationsm1 = chrec_dont_know;
553 279806 : *number_of_iterations = chrec_dont_know;
554 :
555 279806 : DUMP_VECT_SCOPE ("get_loop_niters");
556 :
557 279806 : if (exits.is_empty ())
558 0 : return conds;
559 :
560 279806 : if (dump_enabled_p ())
561 14631 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
562 : exits.length ());
563 :
564 : edge exit;
565 : unsigned int i;
566 681188 : FOR_EACH_VEC_ELT (exits, i, exit)
567 : {
568 401382 : gcond *cond = get_loop_exit_condition (exit);
569 401382 : if (cond)
570 401349 : conds.safe_push (cond);
571 :
572 401382 : if (dump_enabled_p ())
573 15785 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
574 :
575 401382 : if (exit != main_exit)
576 180561 : continue;
577 :
578 279806 : may_be_zero = NULL_TREE;
579 279806 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
580 279806 : || chrec_contains_undetermined (niter_desc.niter))
581 58985 : continue;
582 :
583 220821 : niter_assumptions = niter_desc.assumptions;
584 220821 : may_be_zero = niter_desc.may_be_zero;
585 220821 : niter = niter_desc.niter;
586 :
587 220821 : if (may_be_zero && integer_zerop (may_be_zero))
588 : may_be_zero = NULL_TREE;
589 :
590 9510 : if (may_be_zero)
591 : {
592 9510 : if (COMPARISON_CLASS_P (may_be_zero))
593 : {
594 : /* Try to combine may_be_zero with assumptions; this can simplify
595 : computation of the niter expression. Otherwise rewrite niter to may_be_zero ? 0 : niter. */
596 9510 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
597 1027 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
598 : niter_assumptions,
599 : fold_build1 (TRUTH_NOT_EXPR,
600 : boolean_type_node,
601 : may_be_zero));
602 : else
603 8483 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
604 : build_int_cst (TREE_TYPE (niter), 0),
605 : rewrite_to_non_trapping_overflow (niter));
606 :
607 220821 : may_be_zero = NULL_TREE;
608 : }
609 0 : else if (integer_nonzerop (may_be_zero))
610 : {
611 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
612 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
613 0 : continue;
614 : }
615 : else
616 0 : continue;
617 : }
618 :
619 : /* Loop assumptions are based off the normal exit. */
620 220821 : *assumptions = niter_assumptions;
621 220821 : *number_of_iterationsm1 = niter;
622 :
623 : /* We want the number of loop header executions which is the number
624 : of latch executions plus one.
625 : ??? For UINT_MAX latch executions this number overflows to zero
626 : for loops like do { n++; } while (n != 0); */
627 220821 : if (niter && !chrec_contains_undetermined (niter))
628 : {
629 220821 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
630 : unshare_expr (niter),
631 : build_int_cst (TREE_TYPE (niter), 1));
632 220821 : if (TREE_CODE (niter) == INTEGER_CST
633 121075 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
634 : {
635 : /* If we manage to fold niter + 1 into INTEGER_CST even when
636 : niter is some complex expression, ensure that
637 : *number_of_iterationsm1 is an INTEGER_CST as well. See
638 : PR113210. */
639 0 : *number_of_iterationsm1
640 0 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
641 : build_minus_one_cst (TREE_TYPE (niter)));
642 : }
643 : }
644 220821 : *number_of_iterations = niter;
645 : }
646 :
647 279806 : if (dump_enabled_p ())
648 14631 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
649 :
650 279806 : return conds;
651 279806 : }
652 :
653 : /* Determine the main loop exit for the vectorizer. Returns NULL when LOOP has no exit, when one of several exits has no exit condition, or when no candidate is found. */
654 :
655 : edge
656 492432 : vec_init_loop_exit_info (class loop *loop)
657 : {
658 : /* Before we begin we must first determine which exit is the main one and
659 : which are auxiliary exits. */
660 492432 : auto_vec<edge> exits = get_loop_exit_edges (loop);
661 979827 : if (exits.length () == 0)
662 : return NULL;
663 487395 : if (exits.length () == 1)
664 320880 : return exits[0];
665 :
666 : /* If we have multiple exits, look for counting IV exit.
667 : Analyze all exits; a later analyzable exit replaces the candidate only if its source block is dominated by the candidate's. */
668 166515 : class tree_niter_desc niter_desc;
669 166515 : edge candidate = NULL;
670 618521 : for (edge exit : exits)
671 : {
672 472416 : if (!get_loop_exit_condition (exit))
673 : {
674 20410 : if (dump_enabled_p ())
675 14 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
676 : "Unhandled loop exit detected.\n");
677 20410 : return NULL;
678 : }
679 :
680 452006 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
681 452006 : && !chrec_contains_undetermined (niter_desc.niter))
682 : {
683 133313 : tree may_be_zero = niter_desc.may_be_zero;
684 133313 : if ((integer_zerop (may_be_zero)
685 : /* As we are handling may_be_zero that's not false by
686 : rewriting niter to may_be_zero ? 0 : niter we require
687 : an empty latch. */
688 462570 : || (single_pred_p (loop->latch)
689 10229 : && exit->src == single_pred (loop->latch)
690 2743 : && (integer_nonzerop (may_be_zero)
691 2743 : || COMPARISON_CLASS_P (may_be_zero))))
692 136056 : && (!candidate
693 5855 : || dominated_by_p (CDI_DOMINATORS, exit->src,
694 5855 : candidate->src)))
695 : candidate = exit;
696 : }
697 : }
698 :
699 : /* If no exit is analyzable by scalar evolution, we return the last exit
700 : under the assumption we are dealing with an uncounted loop. */
701 201588 : if (!candidate && single_pred_p (loop->latch))
702 35073 : candidate = loop_exits_from_bb_p (loop, single_pred (loop->latch));
703 :
704 : return candidate;
705 166515 : }
706 :
707 : /* Function bb_in_loop_p
708 :
709 : Used as predicate for dfs order traversal of the loop bbs. DATA is the loop; returns true if BB is inside it. */
710 :
711 : static bool
712 1683735 : bb_in_loop_p (const_basic_block bb, const void *data)
713 : {
714 1683735 : const class loop *const loop = (const class loop *)data;
715 1683735 : if (flow_bb_inside_loop_p (loop, bb))
716 : return true;
717 : return false;
718 : }
719 :
720 :
721 : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
722 : stmt_vec_info structs for all the PHIs and non-debug stmts in LOOP_IN. */
723 :
724 576629 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
725 : : vec_info (vec_info::loop, shared),
726 576629 : loop (loop_in),
727 576629 : num_itersm1 (NULL_TREE),
728 576629 : num_iters (NULL_TREE),
729 576629 : num_iters_unchanged (NULL_TREE),
730 576629 : num_iters_assumptions (NULL_TREE),
731 576629 : vector_costs (nullptr),
732 576629 : scalar_costs (nullptr),
733 576629 : th (0),
734 576629 : versioning_threshold (0),
735 576629 : vectorization_factor (0),
736 576629 : main_loop_edge (nullptr),
737 576629 : skip_main_loop_edge (nullptr),
738 576629 : skip_this_loop_edge (nullptr),
739 576629 : reusable_accumulators (),
740 576629 : suggested_unroll_factor (1),
741 576629 : max_vectorization_factor (0),
742 576629 : mask_skip_niters (NULL_TREE),
743 576629 : mask_skip_niters_pfa_offset (NULL_TREE),
744 576629 : rgroup_compare_type (NULL_TREE),
745 576629 : simd_if_cond (NULL_TREE),
746 576629 : partial_vector_style (vect_partial_vectors_none),
747 576629 : unaligned_dr (NULL),
748 576629 : peeling_for_alignment (0),
749 576629 : ptr_mask (0),
750 576629 : max_spec_read_amount (0),
751 576629 : nonlinear_iv (false),
752 576629 : ivexpr_map (NULL),
753 576629 : scan_map (NULL),
754 576629 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
755 576629 : vectorizable (false),
756 576629 : can_use_partial_vectors_p (true),
757 576629 : must_use_partial_vectors_p (false),
758 576629 : using_partial_vectors_p (false),
759 576629 : using_decrementing_iv_p (false),
760 576629 : using_select_vl_p (false),
761 576629 : allow_mutual_alignment (false),
762 576629 : partial_load_store_bias (0),
763 576629 : peeling_for_gaps (false),
764 576629 : peeling_for_niter (false),
765 576629 : early_breaks (false),
766 576629 : loop_iv_cond (NULL),
767 576629 : user_unroll (false),
768 576629 : no_data_dependencies (false),
769 576629 : has_mask_store (false),
770 576629 : scalar_loop_scaling (profile_probability::uninitialized ()),
771 576629 : scalar_loop (NULL),
772 576629 : main_loop_info (NULL),
773 576629 : orig_loop_info (NULL),
774 576629 : epilogue_vinfo (NULL),
775 576629 : drs_advanced_by (NULL_TREE),
776 576629 : vec_loop_main_exit (NULL),
777 576629 : vec_epilogue_loop_main_exit (NULL),
778 576629 : scalar_loop_main_exit (NULL),
779 576629 : early_break_needs_epilogue (false),
780 576629 : early_break_niters_var (NULL)
781 : {
782 : /* CHECKME: We want to visit all BBs before their successors (except for
783 : latch blocks, for which this assertion wouldn't hold). In the simple
784 : case of the loop forms we allow, a dfs order of the BBs would be the same
785 : as reversed postorder traversal, so we are safe. */
786 :
787 576629 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
788 1153258 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
789 576629 : loop->num_nodes, loop);
790 576629 : gcc_assert (nbbs == loop->num_nodes);
791 :
792 2008586 : for (unsigned int i = 0; i < nbbs; i++)
793 : {
794 1431957 : basic_block bb = bbs[i];
795 1431957 : gimple_stmt_iterator si;
796 :
797 2957283 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
798 : {
799 1525326 : gimple *phi = gsi_stmt (si);
800 1525326 : gimple_set_uid (phi, 0);
801 1525326 : add_stmt (phi);
802 : }
803 :
804 13275723 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
805 : {
806 10411809 : gimple *stmt = gsi_stmt (si);
807 10411809 : gimple_set_uid (stmt, 0);
808 10411809 : if (is_gimple_debug (stmt))
809 4439117 : continue;
810 5972692 : add_stmt (stmt);
811 : /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
812 : third argument is the #pragma omp simd if (x) condition, when 0,
813 : loop shouldn't be vectorized, when non-zero constant, it should
814 : be vectorized normally, otherwise versioned with vectorized loop
815 : done if the condition is non-zero at runtime. */
816 5972692 : if (loop_in->simduid
817 43372 : && is_gimple_call (stmt)
818 4268 : && gimple_call_internal_p (stmt)
819 4141 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
820 4137 : && gimple_call_num_args (stmt) >= 3
821 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
822 5972795 : && (loop_in->simduid
823 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
824 : {
825 103 : tree arg = gimple_call_arg (stmt, 2);
826 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
827 103 : simd_if_cond = arg;
828 : else
829 0 : gcc_assert (integer_nonzerop (arg));
830 : }
831 : }
832 : }
833 576629 : }
834 :
835 : /* Free all levels of rgroup CONTROLS. */
836 :
837 : void
838 1430038 : release_vec_loop_controls (vec<rgroup_controls> *controls)
839 : {
840 1430038 : rgroup_controls *rgc;
841 1430038 : unsigned int i;
842 1454389 : FOR_EACH_VEC_ELT (*controls, i, rgc)
843 24351 : rgc->controls.release ();
844 1430038 : controls->release ();
845 1430038 : }
846 :
847 : /* Free all memory used by the _loop_vec_info, as well as all the
848 : stmt_vec_info structs of all the stmts in the loop. */
849 :
850 576629 : _loop_vec_info::~_loop_vec_info ()
851 : {
852 576629 : free (bbs);
853 :
854 576629 : release_vec_loop_controls (&masks.rgc_vec);
855 576629 : release_vec_loop_controls (&lens);
856 580463 : delete ivexpr_map;
857 576951 : delete scan_map;
858 576629 : delete scalar_costs;
859 576629 : delete vector_costs;
860 788074 : for (auto reduc_info : reduc_infos)
861 202930 : delete reduc_info;
862 :
863 : /* When we release an epiloge vinfo that we do not intend to use
864 : avoid clearing AUX of the main loop which should continue to
865 : point to the main loop vinfo since otherwise we'll leak that. */
866 576629 : if (loop->aux == this)
867 61570 : loop->aux = NULL;
868 1153258 : }
869 :
870 : /* Return an invariant or register for EXPR and emit necessary
871 : computations in the LOOP_VINFO loop preheader. */
872 :
873 : tree
874 19668 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
875 : {
876 19668 : if (is_gimple_reg (expr)
877 19668 : || is_gimple_min_invariant (expr))
878 6808 : return expr;
879 :
880 12860 : if (! loop_vinfo->ivexpr_map)
881 3834 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
882 12860 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
883 12860 : if (! cached)
884 : {
885 8321 : gimple_seq stmts = NULL;
886 8321 : cached = force_gimple_operand (unshare_expr (expr),
887 : &stmts, true, NULL_TREE);
888 8321 : if (stmts)
889 : {
890 8181 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
891 8181 : gsi_insert_seq_on_edge_immediate (e, stmts);
892 : }
893 : }
894 12860 : return cached;
895 : }
896 :
897 : /* Return true if we can use CMP_TYPE as the comparison type to produce
898 : all masks required to mask LOOP_VINFO. */
899 :
900 : static bool
901 109607 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
902 : {
903 109607 : rgroup_controls *rgm;
904 109607 : unsigned int i;
905 125111 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
906 125111 : if (rgm->type != NULL_TREE
907 125111 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
908 : cmp_type, rgm->type,
909 : OPTIMIZE_FOR_SPEED))
910 : return false;
911 : return true;
912 : }
913 :
914 : /* Calculate the maximum number of scalars per iteration for every
915 : rgroup in LOOP_VINFO. */
916 :
917 : static unsigned int
918 23358 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
919 : {
920 23358 : unsigned int res = 1;
921 23358 : unsigned int i;
922 23358 : rgroup_controls *rgm;
923 55984 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
924 32626 : res = MAX (res, rgm->max_nscalars_per_iter);
925 23358 : return res;
926 : }
927 :
928 : /* Calculate the minimum precision necessary to represent:
929 :
930 : MAX_NITERS * FACTOR
931 :
932 : as an unsigned integer, where MAX_NITERS is the maximum number of
933 : loop header iterations for the original scalar form of LOOP_VINFO. */
934 :
935 : unsigned
936 25737 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
937 : {
938 25737 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
939 :
940 : /* Get the maximum number of iterations that is representable
941 : in the counter type. */
942 25737 : tree ni_type;
943 25737 : if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
944 25737 : ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
945 : else
946 0 : ni_type = sizetype;
947 25737 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
948 :
949 : /* Get a more refined estimate for the number of iterations. */
950 25737 : widest_int max_back_edges;
951 25737 : if (max_loop_iterations (loop, &max_back_edges))
952 25737 : max_ni = wi::smin (max_ni, max_back_edges + 1);
953 :
954 : /* Work out how many bits we need to represent the limit. */
955 25737 : return wi::min_precision (max_ni * factor, UNSIGNED);
956 25737 : }
957 :
958 : /* True if the loop needs peeling or partial vectors when vectorized. */
959 :
960 : static bool
961 155579 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
962 : {
963 155579 : unsigned HOST_WIDE_INT const_vf;
964 :
965 155579 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
966 : return true;
967 :
968 13493 : loop_vec_info main_loop_vinfo
969 154289 : = (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
970 154289 : ? LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) : loop_vinfo);
971 154289 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
972 78983 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo) >= 0)
973 : {
974 : /* Work out the (constant) number of iterations that need to be
975 : peeled for reasons other than niters. */
976 78933 : unsigned int peel_niter
977 : = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
978 78933 : return !multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
979 78933 : LOOP_VINFO_VECT_FACTOR (loop_vinfo));
980 : }
981 :
982 75356 : if (!LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo)
983 75356 : && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf))
984 : {
985 : /* When the number of iterations is a multiple of the vectorization
986 : factor and we are not doing prologue or forced epilogue peeling
987 : the epilogue isn't necessary. */
988 74944 : if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
989 149888 : >= (unsigned) exact_log2 (const_vf))
990 : return false;
991 : }
992 :
993 : return true;
994 : }
995 :
996 : /* Each statement in LOOP_VINFO can be masked where necessary. Check
997 : whether we can actually generate the masks required. Return true if so,
998 : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
999 :
1000 : static bool
1001 23358 : vect_verify_full_masking (loop_vec_info loop_vinfo)
1002 : {
1003 23358 : unsigned int min_ni_width;
1004 :
1005 : /* Use a normal loop if there are no statements that need masking.
1006 : This only happens in rare degenerate cases: it means that the loop
1007 : has no loads, no stores, and no live-out values. */
1008 23358 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1009 : return false;
1010 :
1011 : /* Produce the rgroup controls. */
1012 92098 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1013 : {
1014 34370 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1015 34370 : tree vectype = mask.first;
1016 34370 : unsigned nvectors = mask.second;
1017 :
1018 45382 : if (masks->rgc_vec.length () < nvectors)
1019 25490 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1020 34370 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1021 : /* The number of scalars per iteration and the number of vectors are
1022 : both compile-time constants. */
1023 34370 : unsigned int nscalars_per_iter
1024 34370 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1025 34370 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1026 :
1027 34370 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1028 : {
1029 27400 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1030 27400 : rgm->type = truth_type_for (vectype);
1031 27400 : rgm->factor = 1;
1032 : }
1033 : }
1034 :
1035 23358 : unsigned int max_nscalars_per_iter
1036 23358 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1037 :
1038 : /* Work out how many bits we need to represent the limit. */
1039 23358 : min_ni_width
1040 23358 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1041 :
1042 : /* Find a scalar mode for which WHILE_ULT is supported. */
1043 23358 : opt_scalar_int_mode cmp_mode_iter;
1044 23358 : tree cmp_type = NULL_TREE;
1045 23358 : tree iv_type = NULL_TREE;
1046 23358 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1047 23358 : unsigned int iv_precision = UINT_MAX;
1048 :
1049 23358 : if (iv_limit != -1)
1050 23358 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1051 : UNSIGNED);
1052 :
1053 186864 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1054 : {
1055 163506 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1056 163506 : if (cmp_bits >= min_ni_width
1057 163506 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1058 : {
1059 109607 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1060 109607 : if (this_type
1061 109607 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1062 : {
1063 : /* Although we could stop as soon as we find a valid mode,
1064 : there are at least two reasons why that's not always the
1065 : best choice:
1066 :
1067 : - An IV that's Pmode or wider is more likely to be reusable
1068 : in address calculations than an IV that's narrower than
1069 : Pmode.
1070 :
1071 : - Doing the comparison in IV_PRECISION or wider allows
1072 : a natural 0-based IV, whereas using a narrower comparison
1073 : type requires mitigations against wrap-around.
1074 :
1075 : Conversely, if the IV limit is variable, doing the comparison
1076 : in a wider type than the original type can introduce
1077 : unnecessary extensions, so picking the widest valid mode
1078 : is not always a good choice either.
1079 :
1080 : Here we prefer the first IV type that's Pmode or wider,
1081 : and the first comparison type that's IV_PRECISION or wider.
1082 : (The comparison type must be no wider than the IV type,
1083 : to avoid extensions in the vector loop.)
1084 :
1085 : ??? We might want to try continuing beyond Pmode for ILP32
1086 : targets if CMP_BITS < IV_PRECISION. */
1087 0 : iv_type = this_type;
1088 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1089 : cmp_type = this_type;
1090 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1091 : break;
1092 : }
1093 : }
1094 : }
1095 :
1096 23358 : if (!cmp_type)
1097 : {
1098 23358 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1099 23358 : return false;
1100 : }
1101 :
1102 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1103 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1104 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1105 0 : return true;
1106 23358 : }
1107 :
1108 : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1109 : whether we can actually generate AVX512 style masks. Return true if so,
1110 : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1111 :
1112 : static bool
1113 23358 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1114 : {
1115 : /* Produce differently organized rgc_vec and differently check
1116 : we can produce masks. */
1117 :
1118 : /* Use a normal loop if there are no statements that need masking.
1119 : This only happens in rare degenerate cases: it means that the loop
1120 : has no loads, no stores, and no live-out values. */
1121 23358 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1122 : return false;
1123 :
1124 : /* For the decrementing IV we need to represent all values in
1125 : [0, niter + niter_skip] where niter_skip is the elements we
1126 : skip in the first iteration for prologue peeling. */
1127 23358 : tree iv_type = NULL_TREE;
1128 23358 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1129 23358 : unsigned int iv_precision = UINT_MAX;
1130 23358 : if (iv_limit != -1)
1131 23358 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1132 :
1133 : /* First compute the type for the IV we use to track the remaining
1134 : scalar iterations. */
1135 23358 : opt_scalar_int_mode cmp_mode_iter;
1136 30530 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1137 : {
1138 30530 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1139 30530 : if (cmp_bits >= iv_precision
1140 30530 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1141 : {
1142 23358 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1143 23358 : if (iv_type)
1144 : break;
1145 : }
1146 : }
1147 23358 : if (!iv_type)
1148 : return false;
1149 :
1150 : /* Produce the rgroup controls. */
1151 92098 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1152 : {
1153 34370 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1154 34370 : tree vectype = mask.first;
1155 34370 : unsigned nvectors = mask.second;
1156 :
1157 : /* The number of scalars per iteration and the number of vectors are
1158 : both compile-time constants. */
1159 34370 : unsigned int nscalars_per_iter
1160 34370 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1161 34370 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1162 :
1163 : /* We index the rgroup_controls vector with nscalars_per_iter
1164 : which we keep constant and instead have a varying nvectors,
1165 : remembering the vector mask with the fewest nV. */
1166 45382 : if (masks->rgc_vec.length () < nscalars_per_iter)
1167 23411 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1168 34370 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1169 :
1170 34370 : if (!rgm->type || rgm->factor > nvectors)
1171 : {
1172 25316 : rgm->type = truth_type_for (vectype);
1173 25316 : rgm->compare_type = NULL_TREE;
1174 25316 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1175 25316 : rgm->factor = nvectors;
1176 25316 : rgm->bias_adjusted_ctrl = NULL_TREE;
1177 : }
1178 : }
1179 :
1180 : /* There is no fixed compare type we are going to use but we have to
1181 : be able to get at one for each mask group. */
1182 23358 : unsigned int min_ni_width
1183 23358 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1184 :
1185 23358 : bool ok = true;
1186 88331 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1187 : {
1188 24303 : tree mask_type = rgc.type;
1189 24303 : if (!mask_type)
1190 866 : continue;
1191 :
1192 : /* For now vect_get_loop_mask only supports integer mode masks
1193 : when we need to split it. */
1194 23437 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1195 23437 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1196 : {
1197 : ok = false;
1198 : break;
1199 : }
1200 :
1201 : /* If iv_type is usable as compare type use that - we can elide the
1202 : saturation in that case. */
1203 17395 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1204 : {
1205 17395 : tree cmp_vectype
1206 17395 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1207 17395 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1208 5924 : rgc.compare_type = cmp_vectype;
1209 : }
1210 17395 : if (!rgc.compare_type)
1211 32983 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1212 : {
1213 32979 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1214 32979 : if (cmp_bits >= min_ni_width
1215 32979 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1216 : {
1217 32967 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1218 32967 : if (!cmp_type)
1219 0 : continue;
1220 :
1221 : /* Check whether we can produce the mask with cmp_type. */
1222 32967 : tree cmp_vectype
1223 32967 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1224 32967 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1225 : {
1226 11467 : rgc.compare_type = cmp_vectype;
1227 11467 : break;
1228 : }
1229 : }
1230 : }
1231 17395 : if (!rgc.compare_type)
1232 : {
1233 : ok = false;
1234 : break;
1235 : }
1236 : }
1237 23358 : if (!ok)
1238 : {
1239 6046 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1240 6046 : return false;
1241 : }
1242 :
1243 17312 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1244 17312 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1245 17312 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1246 17312 : return true;
1247 23358 : }
1248 :
1249 : /* Check whether we can use vector access with length based on precison
1250 : comparison. So far, to keep it simple, we only allow the case that the
1251 : precision of the target supported length is larger than the precision
1252 : required by loop niters. */
1253 :
1254 : static bool
1255 6 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1256 : {
1257 6 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1258 : return false;
1259 :
1260 0 : if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
1261 : return false;
1262 :
1263 0 : machine_mode len_load_mode, len_store_mode;
1264 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1265 0 : .exists (&len_load_mode))
1266 0 : return false;
1267 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1268 0 : .exists (&len_store_mode))
1269 0 : return false;
1270 :
1271 0 : signed char partial_load_bias = internal_len_load_store_bias
1272 0 : (IFN_LEN_LOAD, len_load_mode);
1273 :
1274 0 : signed char partial_store_bias = internal_len_load_store_bias
1275 0 : (IFN_LEN_STORE, len_store_mode);
1276 :
1277 0 : gcc_assert (partial_load_bias == partial_store_bias);
1278 :
1279 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1280 : return false;
1281 :
1282 : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1283 : len_loads with a length of zero. In order to avoid that we prohibit
1284 : more than one loop length here. */
1285 0 : if (partial_load_bias == -1
1286 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1287 : return false;
1288 :
1289 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1290 :
1291 0 : unsigned int max_nitems_per_iter = 1;
1292 0 : unsigned int i;
1293 0 : rgroup_controls *rgl;
1294 : /* Find the maximum number of items per iteration for every rgroup. */
1295 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1296 : {
1297 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1298 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1299 : }
1300 :
1301 : /* Work out how many bits we need to represent the length limit. */
1302 0 : unsigned int min_ni_prec
1303 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1304 :
1305 : /* Now use the maximum of below precisions for one suitable IV type:
1306 : - the IV's natural precision
1307 : - the precision needed to hold: the maximum number of scalar
1308 : iterations multiplied by the scale factor (min_ni_prec above)
1309 : - the Pmode precision
1310 :
1311 : If min_ni_prec is less than the precision of the current niters,
1312 : we perfer to still use the niters type. Prefer to use Pmode and
1313 : wider IV to avoid narrow conversions. */
1314 :
1315 0 : unsigned int ni_prec
1316 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1317 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1318 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1319 :
1320 0 : tree iv_type = NULL_TREE;
1321 0 : opt_scalar_int_mode tmode_iter;
1322 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1323 : {
1324 0 : scalar_mode tmode = tmode_iter.require ();
1325 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1326 :
1327 : /* ??? Do we really want to construct one IV whose precision exceeds
1328 : BITS_PER_WORD? */
1329 0 : if (tbits > BITS_PER_WORD)
1330 : break;
1331 :
1332 : /* Find the first available standard integral type. */
1333 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1334 : {
1335 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1336 0 : break;
1337 : }
1338 : }
1339 :
1340 0 : if (!iv_type)
1341 : {
1342 0 : if (dump_enabled_p ())
1343 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1344 : "can't vectorize with length-based partial vectors"
1345 : " because there is no suitable iv type.\n");
1346 0 : return false;
1347 : }
1348 :
1349 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1350 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1351 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1352 :
1353 0 : return true;
1354 : }
1355 :
1356 : /* Calculate the cost of one scalar iteration of the loop. */
1357 : static void
1358 363517 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1359 : {
1360 363517 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1361 363517 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1362 363517 : int nbbs = loop->num_nodes, factor;
1363 363517 : int innerloop_iters, i;
1364 :
1365 363517 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1366 :
1367 : /* Gather costs for statements in the scalar loop. */
1368 :
1369 : /* FORNOW. */
1370 363517 : innerloop_iters = 1;
1371 363517 : if (loop->inner)
1372 1604 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1373 :
1374 1249330 : for (i = 0; i < nbbs; i++)
1375 : {
1376 885813 : gimple_stmt_iterator si;
1377 885813 : basic_block bb = bbs[i];
1378 :
1379 885813 : if (bb->loop_father == loop->inner)
1380 : factor = innerloop_iters;
1381 : else
1382 882605 : factor = 1;
1383 :
1384 7287244 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1385 : {
1386 5515618 : gimple *stmt = gsi_stmt (si);
1387 5515618 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1388 :
1389 5515618 : if (!is_gimple_assign (stmt)
1390 : && !is_gimple_call (stmt)
1391 : && !is_a<gcond *> (stmt))
1392 1981949 : continue;
1393 :
1394 : /* Skip stmts that are not vectorized inside the loop. */
1395 3533669 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1396 3533669 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1397 1740325 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1398 53 : || !VECTORIZABLE_CYCLE_DEF
1399 : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1400 1740325 : continue;
1401 :
1402 1793344 : vect_cost_for_stmt kind;
1403 1793344 : if (STMT_VINFO_DATA_REF (stmt_info))
1404 : {
1405 862982 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1406 : kind = scalar_load;
1407 : else
1408 322731 : kind = scalar_store;
1409 : }
1410 930362 : else if (vect_nop_conversion_p (stmt_info))
1411 53821 : continue;
1412 : else
1413 : kind = scalar_stmt;
1414 :
1415 : /* We are using vect_prologue here to avoid scaling twice
1416 : by the inner loop factor. */
1417 1739523 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1418 : factor, kind, stmt_info, 0, vect_body);
1419 : }
1420 : }
1421 :
1422 : /* Now accumulate cost. */
1423 363517 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1424 363517 : add_stmt_costs (loop_vinfo->scalar_costs,
1425 : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1426 363517 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1427 363517 : }
1428 :
1429 : /* Function vect_analyze_loop_form.
1430 :
1431 : Verify that certain CFG restrictions hold, including:
1432 : - the loop has a pre-header
1433 : - the loop has a single entry
1434 : - nested loops can have only a single exit.
1435 : - the loop exit condition is simple enough
1436 : - the number of iterations can be analyzed, i.e, a countable loop. The
1437 : niter could be analyzed under some assumptions. */
1438 :
1439 : opt_result
1440 455941 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1441 : vect_loop_form_info *info)
1442 : {
1443 455941 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1444 :
1445 455941 : edge exit_e = vec_init_loop_exit_info (loop);
1446 455941 : if (!exit_e)
1447 29535 : return opt_result::failure_at (vect_location,
1448 : "not vectorized:"
1449 : " Infinite loop detected.\n");
1450 426406 : if (loop_vectorized_call)
1451 : {
1452 28709 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1453 28709 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1454 28709 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1455 28709 : if (!scalar_exit_e)
1456 0 : return opt_result::failure_at (vect_location,
1457 : "not vectorized:"
1458 : " could not determine main exit from"
1459 : " loop with multiple exits.\n");
1460 : }
1461 :
1462 426406 : info->loop_exit = exit_e;
1463 426406 : if (dump_enabled_p ())
1464 16017 : dump_printf_loc (MSG_NOTE, vect_location,
1465 : "using as main loop exit: %d -> %d [AUX: %p]\n",
1466 16017 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1467 :
1468 : /* Check if we have any control flow that doesn't leave the loop. */
1469 426406 : basic_block *bbs = get_loop_body (loop);
1470 1396039 : for (unsigned i = 0; i < loop->num_nodes; i++)
1471 1085099 : if (EDGE_COUNT (bbs[i]->succs) != 1
1472 1085099 : && (EDGE_COUNT (bbs[i]->succs) != 2
1473 650404 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1474 : {
1475 115466 : free (bbs);
1476 115466 : return opt_result::failure_at (vect_location,
1477 : "not vectorized:"
1478 : " unsupported control flow in loop.\n");
1479 : }
1480 :
1481 : /* Check if we have any control flow that doesn't leave the loop. */
1482 312035 : bool has_phi = false;
1483 312035 : for (unsigned i = 0; i < loop->num_nodes; i++)
1484 311578 : if (!gimple_seq_empty_p (phi_nodes (bbs[i])))
1485 : {
1486 : has_phi = true;
1487 : break;
1488 : }
1489 310940 : if (!has_phi)
1490 457 : return opt_result::failure_at (vect_location,
1491 : "not vectorized:"
1492 : " no scalar evolution detected in loop.\n");
1493 :
1494 310483 : free (bbs);
1495 :
1496 : /* Different restrictions apply when we are considering an inner-most loop,
1497 : vs. an outer (nested) loop.
1498 : (FORNOW. May want to relax some of these restrictions in the future). */
1499 :
1500 310483 : info->inner_loop_cond = NULL;
1501 310483 : if (!loop->inner)
1502 : {
1503 : /* Inner-most loop. */
1504 :
1505 292018 : if (empty_block_p (loop->header))
1506 0 : return opt_result::failure_at (vect_location,
1507 : "not vectorized: empty loop.\n");
1508 : }
1509 : else
1510 : {
1511 18465 : class loop *innerloop = loop->inner;
1512 18465 : edge entryedge;
1513 :
1514 : /* Nested loop. We currently require that the loop is doubly-nested,
1515 : contains a single inner loop with a single exit to the block
1516 : with the single exit condition in the outer loop.
1517 : Vectorizable outer-loops look like this:
1518 :
1519 : (pre-header)
1520 : |
1521 : header <---+
1522 : | |
1523 : inner-loop |
1524 : | |
1525 : tail ------+
1526 : |
1527 : (exit-bb)
1528 :
1529 : The inner-loop also has the properties expected of inner-most loops
1530 : as described above. */
1531 :
1532 18465 : if ((loop->inner)->inner || (loop->inner)->next)
1533 2932 : return opt_result::failure_at (vect_location,
1534 : "not vectorized:"
1535 : " multiple nested loops.\n");
1536 :
1537 15533 : entryedge = loop_preheader_edge (innerloop);
1538 15533 : if (entryedge->src != loop->header
1539 15033 : || !single_exit (innerloop)
1540 26936 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1541 4472 : return opt_result::failure_at (vect_location,
1542 : "not vectorized:"
1543 : " unsupported outerloop form.\n");
1544 :
1545 : /* Analyze the inner-loop. */
1546 11061 : vect_loop_form_info inner;
1547 11061 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1548 11061 : if (!res)
1549 : {
1550 416 : if (dump_enabled_p ())
1551 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1552 : "not vectorized: Bad inner loop.\n");
1553 416 : return res;
1554 : }
1555 :
1556 : /* Don't support analyzing niter under assumptions for inner
1557 : loop. */
1558 10645 : if (!integer_onep (inner.assumptions))
1559 257 : return opt_result::failure_at (vect_location,
1560 : "not vectorized: Bad inner loop.\n");
1561 :
1562 10388 : if (inner.number_of_iterations == chrec_dont_know
1563 10388 : || !expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1564 1835 : return opt_result::failure_at (vect_location,
1565 : "not vectorized: inner-loop count not"
1566 : " invariant.\n");
1567 :
1568 8553 : if (dump_enabled_p ())
1569 1049 : dump_printf_loc (MSG_NOTE, vect_location,
1570 : "Considering outer-loop vectorization.\n");
1571 8553 : info->inner_loop_cond = inner.conds[0];
1572 11061 : }
1573 :
1574 300571 : if (EDGE_COUNT (loop->header->preds) != 2)
1575 0 : return opt_result::failure_at (vect_location,
1576 : "not vectorized:"
1577 : " too many incoming edges.\n");
1578 :
1579 : /* We assume that the latch is empty. */
1580 300571 : basic_block latch = loop->latch;
1581 300571 : do
1582 : {
1583 300571 : if (!empty_block_p (latch)
1584 300571 : || !gimple_seq_empty_p (phi_nodes (latch)))
1585 20732 : return opt_result::failure_at (vect_location,
1586 : "not vectorized: latch block not "
1587 : "empty.\n");
1588 279839 : latch = single_pred (latch);
1589 : }
1590 559678 : while (single_succ_p (latch));
1591 :
1592 : /* Make sure there is no abnormal exit. */
1593 279839 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1594 961060 : for (edge e : exits)
1595 : {
1596 401415 : if (e->flags & EDGE_ABNORMAL)
1597 33 : return opt_result::failure_at (vect_location,
1598 : "not vectorized:"
1599 : " abnormal loop exit edge.\n");
1600 : }
1601 :
1602 279806 : info->conds
1603 279806 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1604 : &info->number_of_iterations,
1605 279806 : &info->number_of_iterationsm1);
1606 279806 : if (info->conds.is_empty ())
1607 33 : return opt_result::failure_at
1608 33 : (vect_location,
1609 : "not vectorized: complicated exit condition.\n");
1610 :
1611 : /* Determine what the primary and alternate exit conds are. */
1612 681122 : for (unsigned i = 0; i < info->conds.length (); i++)
1613 : {
1614 401349 : gcond *cond = info->conds[i];
1615 401349 : if (exit_e->src == gimple_bb (cond))
1616 279773 : std::swap (info->conds[0], info->conds[i]);
1617 : }
1618 :
1619 279773 : if (chrec_contains_undetermined (info->number_of_iterations))
1620 : {
1621 58952 : if (dump_enabled_p ())
1622 257 : dump_printf_loc (MSG_NOTE, vect_location,
1623 : "Loop being analyzed as uncounted.\n");
1624 58952 : if (loop->inner)
1625 562 : return opt_result::failure_at
1626 562 : (vect_location,
1627 : "not vectorized: outer loop vectorization of uncounted loops"
1628 : " is unsupported.\n");
1629 58390 : return opt_result::success ();
1630 : }
1631 :
1632 220821 : if (integer_zerop (info->assumptions))
1633 4 : return opt_result::failure_at
1634 4 : (info->conds[0],
1635 : "not vectorized: number of iterations cannot be computed.\n");
1636 :
1637 220817 : if (integer_zerop (info->number_of_iterations))
1638 12 : return opt_result::failure_at
1639 12 : (info->conds[0],
1640 : "not vectorized: number of iterations = 0.\n");
1641 :
1642 220805 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1643 121052 : && tree_to_shwi (info->number_of_iterations) > 0))
1644 : {
1645 99753 : if (dump_enabled_p ())
1646 : {
1647 2477 : dump_printf_loc (MSG_NOTE, vect_location,
1648 : "Symbolic number of iterations is ");
1649 2477 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1650 2477 : dump_printf (MSG_NOTE, "\n");
1651 : }
1652 : }
1653 :
1654 220805 : if (!integer_onep (info->assumptions))
1655 : {
1656 8643 : if (dump_enabled_p ())
1657 : {
1658 66 : dump_printf_loc (MSG_NOTE, vect_location,
1659 : "Loop to be versioned with niter assumption ");
1660 66 : dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
1661 66 : dump_printf (MSG_NOTE, "\n");
1662 : }
1663 : }
1664 :
1665 220805 : return opt_result::success ();
1666 279839 : }
1667 :
1668 : /* Create a loop_vec_info for LOOP with SHARED, populated from the
1669 : vect_analyze_loop_form result INFO. ORIG_LOOP_INFO may be NULL. */
1670 :
1671 : loop_vec_info
1672 576629 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1673 : const vect_loop_form_info *info,
1674 : loop_vec_info orig_loop_info)
1675 : {
1676 576629 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1677 576629 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1678 576629 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1679 576629 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1680 576629 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1681 576629 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1682 372 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1683 372 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1684 : else
1685 576257 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1686 : /* Also record the assumptions for versioning (only without ORIG_LOOP_INFO). */
1687 576629 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1688 19568 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1689 :
1690 2558294 : for (gcond *cond : info->conds)
1691 : {
1692 828407 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1693 : /* Mark the statement as a condition. */
1694 828407 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1695 : }
1696 : /* Unless the loop is uncounted, conds[0] becomes the loop IV condition; all remaining conditions are pushed to LOOP_VINFO_LOOP_CONDS. */
1697 576629 : unsigned cond_id = 0;
1698 576629 : if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
1699 491633 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[cond_id++];
1700 :
1701 913403 : for (; cond_id < info->conds.length (); cond_id ++)
1702 336774 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[cond_id]);
1703 :
1704 576629 : LOOP_VINFO_MAIN_EXIT (loop_vinfo) = info->loop_exit;
1705 :
1706 : /* Check to see if we're vectorizing multiple exits. */
1707 576629 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1708 576629 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1709 :
1710 : /* At the moment we can't support no epilogs for multiple exits, result of
1711 : the first compare should be masked by that of the second. We can only
1712 : allow it if the early exits have the same live values. For differing
1713 : values we have to calculate a third mask to disambiguate. */
1714 576629 : LOOP_VINFO_EARLY_BRK_NEEDS_EPILOG (loop_vinfo)
1715 576629 : = LOOP_VINFO_LOOP_CONDS (loop_vinfo).length () > 1;
1716 :
1717 576629 : if (info->inner_loop_cond)
1718 : {
1719 : /* If we have an estimate on the number of iterations of the inner
1720 : loop use that to limit the scale for costing, otherwise use
1721 : --param vect-inner-loop-cost-factor literally. */
1722 8962 : widest_int nit;
1723 8962 : if (estimated_stmt_executions (loop->inner, &nit))
1724 7672 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1725 7672 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1726 8962 : }
1727 :
1728 576629 : return loop_vinfo;
1729 : }
1730 :
1731 :
1732 :
1733 : /* Return true if we know that the iteration count is smaller than the
1734 : vectorization factor. Return false if it isn't, or if we can't be sure
1735 : either way. The factor compared against is vect_vf_for_cost (LOOP_VINFO). */
1736 :
1737 : static bool
1738 154661 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1739 : {
1740 154661 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1741 : /* Use the known iteration count if there is one, else the max_stmt_executions_int bound for the loop. */
1742 154661 : HOST_WIDE_INT max_niter;
1743 154661 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1744 79206 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1745 : else
1746 75455 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1747 : /* A value of -1 is never treated as "smaller" (presumably it means no bound is available). */
1748 154661 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1749 10892 : return true;
1750 :
1751 : return false;
1752 : }
1753 :
1754 : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1755 : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1756 : definitely no, or -1 if it's worth retrying. SUGGESTED_UNROLL_FACTOR is passed on to vect_estimate_min_profitable_iters. */
1757 :
1758 : static int
1759 154670 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1760 : unsigned *suggested_unroll_factor)
1761 : {
1762 154670 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1763 154670 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1764 :
1765 : /* Only loops that can handle partially-populated vectors can have iteration
1766 : counts less than the vectorization factor. */
1767 154670 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
1768 154670 : && vect_known_niters_smaller_than_vf (loop_vinfo))
1769 : {
1770 10882 : if (dump_enabled_p ())
1771 236 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1772 : "not vectorized: iteration count smaller than "
1773 : "vectorization factor.\n");
1774 10882 : return 0;
1775 : }
1776 :
1777 : /* If we know the number of iterations we can do better, for the
1778 : epilogue we can also decide whether the main loop leaves us
1779 : with enough iterations, preferring a smaller vector epilog then
1780 : also possibly used for the case we skip the vector loop. */
1781 143788 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1782 : {
1783 69562 : widest_int scalar_niters
1784 69562 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
1785 69562 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1786 : {
1787 2646 : loop_vec_info orig_loop_vinfo
1788 : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1789 2646 : loop_vec_info main_loop_vinfo
1790 : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
1791 2646 : unsigned lowest_vf
1792 2646 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
1793 2646 : int prolog_peeling = 0;
1794 2646 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
1795 2646 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
1796 2646 : if (prolog_peeling >= 0
1797 2646 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
1798 : lowest_vf))
1799 : {
1800 5282 : unsigned gap
1801 2641 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
1802 5282 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
1803 5282 : % lowest_vf + gap);
1804 : }
1805 : }
1806 : /* Reject vectorizing for a single scalar iteration, even if
1807 : we could in principle implement that using partial vectors.
1808 : But allow such vectorization if VF == 1 in case we do not
1809 : need to peel for gaps (if we need, avoid vectorization for
1810 : reasons of code footprint). */
1811 69562 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1812 69562 : if (scalar_niters <= peeling_gap + 1
1813 69562 : && (assumed_vf > 1 || peeling_gap != 0))
1814 : {
1815 653 : if (dump_enabled_p ())
1816 159 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1817 : "not vectorized: loop only has a single "
1818 : "scalar iteration.\n");
1819 653 : return 0;
1820 : }
1821 :
1822 68909 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1823 : {
1824 : /* Check that the loop processes at least one full vector. */
1825 68898 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1826 68898 : if (known_lt (scalar_niters, vf))
1827 : {
1828 348 : if (dump_enabled_p ())
1829 296 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1830 : "loop does not have enough iterations "
1831 : "to support vectorization.\n");
1832 388 : return 0;
1833 : }
1834 :
1835 : /* If we need to peel an extra epilogue iteration to handle data
1836 : accesses with gaps, check that there are enough scalar iterations
1837 : available.
1838 :
1839 : The check above is redundant with this one when peeling for gaps,
1840 : but the distinction is useful for diagnostics. */
1841 68550 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1842 68856 : && known_le (scalar_niters, vf))
1843 : {
1844 40 : if (dump_enabled_p ())
1845 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1846 : "loop does not have enough iterations "
1847 : "to support peeling for gaps.\n");
1848 40 : return 0;
1849 : }
1850 : }
1851 69562 : }
1852 :
1853 : /* If using the "very cheap" model, reject cases in which we'd keep
1854 : a copy of the scalar code (even if we might be able to vectorize it). */
1855 142747 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1856 142747 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1857 75821 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
1858 : {
1859 721 : if (dump_enabled_p ())
1860 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 : "some scalar iterations would need to be peeled\n");
1862 721 : return 0;
1863 : }
1864 :
1865 142026 : int min_profitable_iters, min_profitable_estimate;
1866 142026 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1867 : &min_profitable_estimate,
1868 : suggested_unroll_factor);
1869 :
1870 142026 : if (min_profitable_iters < 0)
1871 : {
1872 24892 : if (dump_enabled_p ())
1873 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 : "not vectorized: vectorization not profitable.\n");
1875 24892 : if (dump_enabled_p ())
1876 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1877 : "not vectorized: vector version will never be "
1878 : "profitable.\n");
1879 24892 : return -1;
1880 : }
1881 :
1882 117134 : int min_scalar_loop_bound = (param_min_vect_loop_bound
1883 117134 : * assumed_vf);
1884 :
1885 : /* Use the cost model only if it is more conservative than user specified
1886 : threshold. */
1887 117134 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1888 : min_profitable_iters);
1889 :
1890 117134 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1891 :
1892 63238 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1893 180372 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1894 : {
1895 442 : if (dump_enabled_p ())
1896 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1897 : "not vectorized: vectorization not profitable.\n");
1898 442 : if (dump_enabled_p ())
1899 1 : dump_printf_loc (MSG_NOTE, vect_location,
1900 : "not vectorized: iteration count smaller than user "
1901 : "specified loop bound parameter or minimum profitable "
1902 : "iterations (whichever is more conservative).\n");
1903 442 : return 0;
1904 : }
1905 :
1906 : /* The static profitability threshold min_profitable_estimate includes
1907 : the cost of having to check at runtime whether the scalar loop
1908 : should be used instead. If it turns out that we don't need or want
1909 : such a check, the threshold we should use for the static estimate
1910 : is simply the point at which the vector loop becomes more profitable
1911 : than the scalar loop. */
1912 116692 : if (min_profitable_estimate > min_profitable_iters
1913 24820 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1914 24223 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1915 613 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1916 117305 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1917 : {
1918 12 : if (dump_enabled_p ())
1919 7 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1920 : " choice between the scalar and vector loops\n");
1921 12 : min_profitable_estimate = min_profitable_iters;
1922 : }
1923 :
1924 : /* If the vector loop needs multiple iterations to be beneficial then
1925 : things are probably too close to call, and the conservative thing
1926 : would be to stick with the scalar code. */
1927 116692 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1928 116692 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1929 : {
1930 17817 : if (dump_enabled_p ())
1931 223 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1932 : "one iteration of the vector loop would be"
1933 : " more expensive than the equivalent number of"
1934 : " iterations of the scalar loop\n");
1935 17817 : return 0;
1936 : }
1937 :
1938 98875 : HOST_WIDE_INT estimated_niter;
1939 :
1940 : /* If we are vectorizing an epilogue then we know the maximum number of
1941 : scalar iterations it will cover is at least one lower than the
1942 : vectorization factor of the main loop. */
1943 98875 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1944 12165 : estimated_niter
1945 12165 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1946 : else
1947 : {
1948 86710 : estimated_niter = estimated_stmt_executions_int (loop);
1949 86710 : if (estimated_niter == -1)
1950 31384 : estimated_niter = likely_max_stmt_executions_int (loop);
1951 : }
1952 43549 : if (estimated_niter != -1
1953 95925 : && ((unsigned HOST_WIDE_INT) estimated_niter
1954 95925 : < MAX (th, (unsigned) min_profitable_estimate)))
1955 : {
1956 4407 : if (dump_enabled_p ())
1957 32 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1958 : "not vectorized: estimated iteration count too "
1959 : "small.\n");
1960 4407 : if (dump_enabled_p ())
1961 32 : dump_printf_loc (MSG_NOTE, vect_location,
1962 : "not vectorized: estimated iteration count smaller "
1963 : "than specified loop bound parameter or minimum "
1964 : "profitable iterations (whichever is more "
1965 : "conservative).\n");
1966 4407 : return -1;
1967 : }
1968 :
1969 : /* As we cannot use a runtime check to gate profitability for uncounted
1970 : loops require either an estimate or if none, at least a profitable
1971 : vectorization within the first vector iteration (that condition
1972 : will practically never be true due to the required epilog and
1973 : likely alignment prologue). */
1974 94468 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)
1975 163 : && estimated_niter == -1
1976 94604 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1977 : {
1978 120 : if (dump_enabled_p ())
1979 2 : dump_printf_loc (MSG_NOTE, vect_location,
1980 : "not vectorized: no loop iteration estimate on the "
1981 : "uncounted loop and not trivially profitable.\n");
1982 120 : return -1;
1983 : }
1984 :
1985 : return 1;
1986 : }
1987 :
1988 : /* Gather data references in LOOP with body BBS and store them into
1989 : *DATAREFS. Returns the failure from vect_find_stmt_data_reference (unless the stmt is an ignorable simd-clone call), or a failure once param loop-max-datarefs-for-datadeps is exceeded. */
1990 :
1991 : static opt_result
1992 277447 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1993 : vec<data_reference_p> *datarefs)
1994 : {
1995 827299 : for (unsigned i = 0; i < loop->num_nodes; i++)
1996 1226006 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1997 5298052 : !gsi_end_p (gsi); gsi_next (&gsi))
1998 : {
1999 4748200 : gimple *stmt = gsi_stmt (gsi);
2000 4748200 : if (is_gimple_debug (stmt))
2001 2247410 : continue;
2002 2500920 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2003 : NULL, 0);
2004 2500920 : if (!res)
2005 : {
2006 63281 : if (is_gimple_call (stmt) && loop->safelen)
2007 : {
2008 404 : tree fndecl = gimple_call_fndecl (stmt), op;
2009 404 : if (fndecl == NULL_TREE
2010 404 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2011 : {
2012 0 : fndecl = gimple_call_arg (stmt, 0);
2013 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2014 0 : fndecl = TREE_OPERAND (fndecl, 0);
2015 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2016 : }
2017 404 : if (fndecl != NULL_TREE)
2018 : {
2019 367 : cgraph_node *node = cgraph_node::get (fndecl);
2020 367 : if (node != NULL && node->simd_clones != NULL)
2021 : {
2022 131 : unsigned int j, n = gimple_call_num_args (stmt);
2023 545 : for (j = 0; j < n; j++)
2024 : {
2025 284 : op = gimple_call_arg (stmt, j);
2026 284 : if (DECL_P (op)
2027 284 : || (REFERENCE_CLASS_P (op)
2028 0 : && get_base_address (op)))
2029 : break;
2030 : }
2031 131 : op = gimple_call_lhs (stmt);
2032 : /* Ignore #pragma omp declare simd functions
2033 : if they don't have data references in the
2034 : call stmt itself. */
2035 261 : if (j == n
2036 131 : && !(op
2037 120 : && (DECL_P (op)
2038 120 : || (REFERENCE_CLASS_P (op)
2039 0 : && get_base_address (op)))))
2040 130 : continue;
2041 : }
2042 : }
2043 : }
2044 63151 : return res;
2045 : }
2046 : /* If dependence analysis will give up due to the limit on the
2047 : number of datarefs stop here and fail fatally. */
2048 4275862 : if (datarefs->length ()
2049 1838223 : > (unsigned)param_loop_max_datarefs_for_datadeps)
2050 0 : return opt_result::failure_at (stmt, "exceeded param "
2051 : "loop-max-datarefs-for-datadeps\n");
2052 : }
2053 214296 : return opt_result::success ();
2054 : }
2055 :
2056 : /* Determine if operating on full vectors for LOOP_VINFO might leave
2057 : some scalar iterations still to do. If so, decide how we should
2058 : handle those scalar iterations. The possibilities are:
2059 :
2060 : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2061 : In this case:
2062 :
2063 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2064 : LOOP_VINFO_PEELING_FOR_NITER == false
2065 :
2066 : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2067 : to handle the remaining scalar iterations. In this case:
2068 :
2069 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2070 : LOOP_VINFO_PEELING_FOR_NITER == true
2071 :
2072 : The MASKED_P argument specifies to what extent
2073 : param_vect_partial_vector_usage is to be honored. For MASKED_P == 0
2074 : no partial vectors are to be used, for MASKED_P == -1 it's
2075 : param_vect_partial_vector_usage that gets to decide whether we may
2076 : consider partial vector usage. For MASKED_P == 1 partial vectors
2077 : may be used if possible.
2078 :
2079 : Fails only if the loop must use partial vectors but was not set up to. */
2080 :
2081 : static opt_result
2082 155579 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2083 : int masked_p)
2084 : {
2085 : /* Determine whether there would be any scalar iterations left over. */
2086 155579 : bool need_peeling_or_partial_vectors_p
2087 155579 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2088 :
2089 : /* Decide whether to vectorize the loop with partial vectors. */
2090 155579 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2091 155579 : if (masked_p == 0
2092 155579 : || (masked_p == -1 && param_vect_partial_vector_usage == 0))
2093 : /* If requested explicitly do not use partial vectors (the flag was already cleared above). */
2094 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2095 207 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2096 65 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2097 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2098 207 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2099 65 : && need_peeling_or_partial_vectors_p)
2100 : {
2101 : /* For partial-vector-usage=1, try to push the handling of partial
2102 : vectors to the epilogue, with the main loop continuing to operate
2103 : on full vectors.
2104 :
2105 : If we are unrolling we also do not want to use partial vectors. This
2106 : is to avoid the overhead of generating multiple masks and also to
2107 : avoid having to execute entire iterations of FALSE masked instructions
2108 : when dealing with one or fewer full iterations.
2109 :
2110 : ??? We could then end up failing to use partial vectors if we
2111 : decide to peel iterations into a prologue, and if the main loop
2112 : then ends up processing fewer than VF iterations. */
2113 43 : if ((param_vect_partial_vector_usage == 1
2114 11 : || loop_vinfo->suggested_unroll_factor > 1)
2115 32 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2116 65 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2117 : ;
2118 : else
2119 31 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2120 : }
2121 :
2122 155579 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2123 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2124 0 : return opt_result::failure_at (vect_location,
2125 : "not vectorized: loop needs but cannot "
2126 : "use partial vectors\n");
2127 :
2128 155579 : if (dump_enabled_p ())
2129 12515 : dump_printf_loc (MSG_NOTE, vect_location,
2130 : "operating on %s vectors%s.\n",
2131 12515 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2132 : ? "partial" : "full",
2133 12515 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2134 : ? " for epilogue loop" : "");
2135 : /* Peeling for niter is needed exactly when leftover iterations exist and partial vectors are not used. */
2136 155579 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2137 311158 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2138 155579 : && need_peeling_or_partial_vectors_p);
2139 :
2140 155579 : return opt_result::success ();
2141 : }
2142 :
2143 : /* Function vect_analyze_loop_2.
2144 :
2145 : Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2146 : analyses will record information in some members of LOOP_VINFO. FATAL
2147 : indicates if some analysis meets fatal error. If one non-NULL pointer
2148 : SUGGESTED_UNROLL_FACTOR is provided, it's intended to be filled with one
2149 : worked out suggested unroll factor, while one NULL pointer shows it's
2150 : going to apply the suggested unroll factor.
2151 : SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF is to hold whether single-lane
2152 : slp was forced when the suggested unroll factor was worked out. */
2153 : static opt_result
2154 575929 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, int masked_p, bool &fatal,
2155 : unsigned *suggested_unroll_factor,
2156 : bool& single_lane_slp_done_for_suggested_uf)
2157 : {
2158 575929 : opt_result ok = opt_result::success ();
2159 575929 : int res;
2160 575929 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2161 575929 : loop_vec_info orig_loop_vinfo = NULL;
2162 :
2163 : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2164 : loop_vec_info of the first vectorized loop. */
2165 575929 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2166 13905 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2167 : else
2168 : orig_loop_vinfo = loop_vinfo;
2169 13905 : gcc_assert (orig_loop_vinfo);
2170 :
2171 : /* We can't mask on niters for uncounted loops due to unknown upper bound. */
2172 575929 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
2173 84996 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2174 :
2175 : /* The first group of checks is independent of the vector size. */
2176 575929 : fatal = true;
2177 :
2178 575929 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2179 575929 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2180 5 : return opt_result::failure_at (vect_location,
2181 : "not vectorized: simd if(0)\n");
2182 :
2183 : /* Find all data references in the loop (which correspond to vdefs/vuses)
2184 : and analyze their evolution in the loop. */
2185 :
2186 575924 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2187 :
2188 : /* Gather the data references. */
2189 575924 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2190 : {
2191 277447 : opt_result res
2192 277447 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2193 : &LOOP_VINFO_DATAREFS (loop_vinfo));
2194 277447 : if (!res)
2195 : {
2196 63151 : if (dump_enabled_p ())
2197 1642 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2198 : "not vectorized: loop contains function "
2199 : "calls or data references that cannot "
2200 : "be analyzed\n");
2201 63151 : return res;
2202 : }
2203 214296 : loop_vinfo->shared->save_datarefs ();
2204 : }
2205 : else
2206 298477 : loop_vinfo->shared->check_datarefs ();
2207 :
2208 : /* Analyze the data references and also adjust the minimal
2209 : vectorization factor according to the loads and stores. */
2210 :
2211 512773 : ok = vect_analyze_data_refs (loop_vinfo, &fatal);
2212 512773 : if (!ok)
2213 : {
2214 72375 : if (dump_enabled_p ())
2215 1230 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2216 : "bad data references.\n");
2217 72375 : return ok;
2218 : }
2219 :
2220 : /* Check if we are applying unroll factor now. */
2221 440398 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2222 440398 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2223 :
2224 : /* When single-lane SLP was forced and we are applying suggested unroll
2225 : factor, keep that decision here. */
2226 880796 : bool force_single_lane = (applying_suggested_uf
2227 440398 : && single_lane_slp_done_for_suggested_uf);
2228 :
2229 : /* Classify all cross-iteration scalar data-flow cycles.
2230 : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2231 440398 : vect_analyze_scalar_cycles (loop_vinfo);
2232 :
2233 440398 : vect_pattern_recog (loop_vinfo);
2234 :
2235 : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2236 : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2237 :
2238 440398 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2239 440398 : if (!ok)
2240 : {
2241 7948 : if (dump_enabled_p ())
2242 291 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243 : "bad data access.\n");
2244 7948 : return ok;
2245 : }
2246 :
2247 : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2248 :
2249 432450 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2250 432450 : if (!ok)
2251 : {
2252 45134 : if (dump_enabled_p ())
2253 401 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2254 : "unexpected pattern.\n");
2255 45134 : return ok;
2256 : }
2257 :
2258 : /* While the rest of the analysis below depends on it in some way. */
2259 387316 : fatal = false;
2260 :
2261 : /* Analyze data dependences between the data-refs in the loop
2262 : and adjust the maximum vectorization factor according to
2263 : the dependences.
2264 : FORNOW: fail at the first data dependence that we encounter. */
2265 :
2266 387316 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2267 387316 : if (!ok)
2268 : {
2269 23799 : if (dump_enabled_p ())
2270 538 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 : "bad data dependence.\n");
2272 23799 : return ok;
2273 : }
2274 363517 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2275 :
2276 : /* Compute the scalar iteration cost. */
2277 363517 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2278 :
2279 363517 : bool saved_can_use_partial_vectors_p
2280 : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2281 :
2282 : /* This is the point where we can re-start analysis with single-lane
2283 : SLP forced. */
2284 498884 : start_over:
2285 :
2286 : /* Check the SLP opportunities in the loop, analyze and build
2287 : SLP trees. */
2288 997768 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2289 : force_single_lane);
2290 498884 : if (!ok)
2291 24962 : return ok;
2292 :
2293 : /* If there are any SLP instances mark them as pure_slp and compute
2294 : the overall vectorization factor. */
2295 473922 : if (!vect_make_slp_decision (loop_vinfo))
2296 61102 : return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
2297 :
2298 412820 : if (dump_enabled_p ())
2299 19072 : dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
2300 :
2301 : /* Dump the vectorization factor from the SLP decision. */
2302 412820 : if (dump_enabled_p ())
2303 : {
2304 19072 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
2305 19072 : dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2306 19072 : dump_printf (MSG_NOTE, "\n");
2307 : }
2308 :
2309 : /* We don't expect to have to roll back to anything other than an empty
2310 : set of rgroups. */
2311 412820 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2312 :
2313 : /* Apply the suggested unrolling factor, this was determined by the backend
2314 : during finish_cost the first time we ran the analysis for this
2315 : vector mode. */
2316 412820 : if (applying_suggested_uf)
2317 437 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2318 :
2319 : /* Now the vectorization factor is final. */
2320 412820 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2321 412820 : gcc_assert (known_ne (vectorization_factor, 0U));
2322 :
2323 : /* Optimize the SLP graph with the vectorization factor fixed. */
2324 412820 : vect_optimize_slp (loop_vinfo);
2325 :
2326 : /* Gather the loads reachable from the SLP graph entries. */
2327 412820 : vect_gather_slp_loads (loop_vinfo);
2328 :
2329 412820 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2330 : {
2331 14169 : dump_printf_loc (MSG_NOTE, vect_location,
2332 : "vectorization_factor = ");
2333 14169 : dump_dec (MSG_NOTE, vectorization_factor);
2334 14169 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2335 14169 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2336 : }
2337 :
2338 412820 : if (max_vf != MAX_VECTORIZATION_FACTOR
2339 412820 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2340 41 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2341 :
2342 412779 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2343 :
2344 : /* Analyze the alignment of the data-refs in the loop. */
2345 412779 : vect_analyze_data_refs_alignment (loop_vinfo);
2346 :
2347 : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2348 : It is important to call pruning after vect_analyze_data_ref_accesses,
2349 : since we use grouping information gathered by interleaving analysis. */
2350 412779 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2351 412779 : if (!ok)
2352 16946 : return ok;
2353 :
2354 : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2355 : vectorization, since we do not want to add extra peeling or
2356 : add versioning for alignment. */
2357 395833 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2358 : /* This pass will decide on using loop versioning and/or loop peeling in
2359 : order to enhance the alignment of data references in the loop. */
2360 381105 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2361 395833 : if (!ok)
2362 0 : return ok;
2363 :
2364 : /* Analyze operations in the SLP instances. We can't simply
2365 : remove unsupported SLP instances as this makes the above
2366 : SLP kind detection invalid and might also affect the VF. */
2367 395833 : if (! vect_slp_analyze_operations (loop_vinfo))
2368 : {
2369 240254 : ok = opt_result::failure_at (vect_location,
2370 : "unsupported SLP instances\n");
2371 240254 : goto again;
2372 : }
2373 :
2374 : /* For now, we don't expect to mix both masking and length approaches for one
2375 : loop, disable it if both are recorded. */
2376 155579 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2377 23364 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2378 178937 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2379 : {
2380 0 : if (dump_enabled_p ())
2381 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2382 : "can't vectorize a loop with partial vectors"
2383 : " because we don't expect to mix different"
2384 : " approaches with partial vectors for the"
2385 : " same loop.\n");
2386 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2387 : }
2388 :
2389 : /* If we still have the option of using partial vectors,
2390 : check whether we can generate the necessary loop controls. */
2391 155579 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2392 : {
2393 23364 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2394 : {
2395 23358 : if (!vect_verify_full_masking (loop_vinfo)
2396 23358 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2397 6046 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2398 : }
2399 : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2400 6 : if (!vect_verify_loop_lens (loop_vinfo))
2401 6 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2402 : }
2403 :
2404 : /* Decide whether this loop_vinfo should use partial vectors or peeling,
2405 : assuming that the loop will be used as a main loop. We will redo
2406 : this analysis later if we instead decide to use the loop as an
2407 : epilogue loop. */
2408 155579 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, masked_p);
2409 155579 : if (!ok)
2410 0 : return ok;
2411 :
2412 : /* If we're vectorizing a loop that uses length "controls" and
2413 : can iterate more than once, we apply decrementing IV approach
2414 : in loop control. */
2415 155579 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2416 31 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2417 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2418 155579 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2419 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2420 : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2421 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2422 :
2423 : /* If a loop uses length controls and has a decrementing loop control IV,
2424 : we will normally pass that IV through a MIN_EXPR to calculate the
2425 : basis for the length controls. E.g. in a loop that processes one
2426 : element per scalar iteration, the number of elements would be
2427 : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2428 :
2429 : This MIN_EXPR approach allows us to use pointer IVs with an invariant
2430 : step, since only the final iteration of the vector loop can have
2431 : inactive lanes.
2432 :
2433 : However, some targets have a dedicated instruction for calculating the
2434 : preferred length, given the total number of elements that still need to
2435 : be processed. This is encapsulated in the SELECT_VL internal function.
2436 :
2437 : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2438 : to determine the basis for the length controls. However, unlike the
2439 : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2440 : lanes inactive in any iteration of the vector loop, not just the last
2441 : iteration. This SELECT_VL approach therefore requires us to use pointer
2442 : IVs with variable steps.
2443 :
2444 : Once we've decided how many elements should be processed by one
2445 : iteration of the vector loop, we need to populate the rgroup controls.
2446 : If a loop has multiple rgroups, we need to make sure that those rgroups
2447 : "line up" (that is, they must be consistent about which elements are
2448 : active and which aren't). This is done by vect_adjust_loop_lens_control.
2449 :
2450 : In principle, it would be possible to use vect_adjust_loop_lens_control
2451 : on either the result of a MIN_EXPR or the result of a SELECT_VL.
2452 : However:
2453 :
2454 : (1) In practice, it only makes sense to use SELECT_VL when a vector
2455 : operation will be controlled directly by the result. It is not
2456 : worth using SELECT_VL if it would only be the input to other
2457 : calculations.
2458 :
2459 : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2460 : pointer IV will need N updates by a variable amount (N-1 updates
2461 : within the iteration and 1 update to move to the next iteration).
2462 :
2463 : Because of this, we prefer to use the MIN_EXPR approach whenever there
2464 : is more than one length control.
2465 :
2466 : In addition, SELECT_VL always operates to a granularity of 1 unit.
2467 : If we wanted to use it to control an SLP operation on N consecutive
2468 : elements, we would need to make the SELECT_VL inputs measure scalar
2469 : iterations (rather than elements) and then multiply the SELECT_VL
2470 : result by N. But using SELECT_VL this way is inefficient because
2471 : of (1) above.
2472 :
2473 : 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2474 : satisfied:
2475 :
2476 : (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2477 : (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2478 :
2479 : SELECT_VL (variable step) makes SCEV analysis fail, and we would then
2480 : lose the benefits of the subsequent unroll optimizations. We prefer
2481 : using the MIN_EXPR approach in this situation. */
2482 155579 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2483 : {
2484 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2485 0 : if (LOOP_VINFO_LENS (loop_vinfo).length () == 1
2486 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
2487 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2488 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2489 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2490 :
2491 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2492 0 : for (auto rgc : LOOP_VINFO_LENS (loop_vinfo))
2493 0 : if (rgc.type
2494 0 : && !direct_internal_fn_supported_p (IFN_SELECT_VL,
2495 : rgc.type, iv_type,
2496 : OPTIMIZE_FOR_SPEED))
2497 : {
2498 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2499 0 : break;
2500 : }
2501 :
2502 : /* If any of the SLP instances cover more than a single lane
2503 : we cannot use .SELECT_VL at the moment, even if the number
2504 : of lanes is uniform throughout the SLP graph. */
2505 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2506 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
2507 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
2508 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
2509 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
2510 : {
2511 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2512 0 : break;
2513 : }
2514 : }
2515 :
2516 : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2517 : to be able to handle fewer than VF scalars, or needs to have a lower VF
2518 : than the main loop. */
2519 155579 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2520 13571 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2521 : {
2522 13557 : poly_uint64 unscaled_vf
2523 13557 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2524 : orig_loop_vinfo->suggested_unroll_factor);
2525 13557 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
2526 391 : return opt_result::failure_at (vect_location,
2527 : "Vectorization factor too high for"
2528 : " epilogue loop.\n");
2529 : }
2530 :
2531 : /* If the epilogue needs peeling for gaps but the main loop doesn't, give
2532 : up on the epilogue. */
2533 155188 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2534 13180 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2535 73 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
2536 : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2537 4 : return opt_result::failure_at (vect_location,
2538 : "Epilogue loop requires peeling for gaps "
2539 : "but main loop does not.\n");
2540 :
2541 : /* If an epilogue loop is required make sure we can create one. */
2542 155184 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2543 153903 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2544 56587 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2545 : {
2546 100112 : if (dump_enabled_p ())
2547 5550 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2548 100112 : if (!vect_can_advance_ivs_p (loop_vinfo)
2549 199710 : || !slpeel_can_duplicate_loop_p (loop,
2550 : LOOP_VINFO_MAIN_EXIT (loop_vinfo),
2551 99598 : LOOP_VINFO_MAIN_EXIT (loop_vinfo)))
2552 : {
2553 514 : ok = opt_result::failure_at (vect_location,
2554 : "not vectorized: can't create required "
2555 : "epilog loop\n");
2556 514 : goto again;
2557 : }
2558 : }
2559 :
2560 : /* Check the costings of the loop make vectorizing worthwhile. */
2561 154670 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2562 154670 : if (res < 0 && !param_vect_allow_possibly_not_worthwhile_vectorizations)
2563 : {
2564 29419 : ok = opt_result::failure_at (vect_location,
2565 : "Loop costings may not be worthwhile.\n");
2566 29419 : goto again;
2567 : }
2568 125251 : if (!res)
2569 30903 : return opt_result::failure_at (vect_location,
2570 : "Loop costings not worthwhile.\n");
2571 :
2572 : /* During peeling, we need to check if number of loop iterations is
2573 : enough for both peeled prolog loop and vector loop. This check
2574 : can be merged along with threshold check of loop versioning, so
2575 : increase threshold for this case if necessary.
2576 :
2577 : If we are analyzing an epilogue we still want to check what its
2578 : versioning threshold would be. If we decide to vectorize the epilogues we
2579 : will want to use the lowest versioning threshold of all epilogues and main
2580 : loop. This will enable us to enter a vectorized epilogue even when
2581 : versioning the loop. We can't simply check whether the epilogue requires
2582 : versioning though since we may have skipped some versioning checks when
2583 : analyzing the epilogue. For instance, checks for alias versioning will be
2584 : skipped when dealing with epilogues as we assume we already checked them
2585 : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2586 94348 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2587 : {
2588 8973 : poly_uint64 niters_th = 0;
2589 8973 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2590 :
2591 8973 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2592 : {
2593 : /* Niters for peeled prolog loop. */
2594 8973 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2595 : {
2596 115 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2597 115 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2598 115 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2599 : }
2600 : else
2601 8858 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2602 : }
2603 :
2604 : /* Niters for at least one iteration of vectorized loop. */
2605 8973 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2606 8969 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2607 : /* One additional iteration because of peeling for gap. */
2608 8973 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2609 66 : niters_th += 1;
2610 :
2611 : /* Use the same condition as vect_transform_loop to decide when to use
2612 : the cost to determine a versioning threshold. */
2613 8973 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2614 8973 : && ordered_p (th, niters_th))
2615 6647 : niters_th = ordered_max (poly_uint64 (th), niters_th);
2616 :
2617 8973 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2618 : }
2619 :
2620 94348 : gcc_assert (known_eq (vectorization_factor,
2621 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2622 :
2623 94348 : single_lane_slp_done_for_suggested_uf = force_single_lane;
2624 :
2625 : /* Ok to vectorize! */
2626 94348 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2627 94348 : return opt_result::success ();
2628 :
2629 270187 : again:
2630 : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2631 270187 : gcc_assert (!ok);
2632 :
2633 : /* Try again with single-lane SLP. */
2634 270187 : if (force_single_lane)
2635 133872 : return ok;
2636 :
2637 : /* If we are applying suggested unroll factor, we don't need to
2638 : re-try any more as we want to keep the SLP mode fixed. */
2639 136315 : if (applying_suggested_uf)
2640 10 : return ok;
2641 :
2642 : /* Likewise if the grouped loads or stores in the SLP cannot be handled
2643 : via interleaving or lane instructions. */
2644 : slp_instance instance;
2645 : slp_tree node;
2646 : unsigned i, j;
2647 369974 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2648 : {
2649 234607 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
2650 0 : continue;
2651 :
2652 234607 : stmt_vec_info vinfo;
2653 234607 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2654 234607 : if (!vinfo || !STMT_VINFO_GROUPED_ACCESS (vinfo))
2655 231900 : continue;
2656 2707 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2657 2707 : unsigned int size = DR_GROUP_SIZE (vinfo);
2658 2707 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
2659 2707 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
2660 4735 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2661 5408 : && ! vect_grouped_store_supported (vectype, size))
2662 673 : return opt_result::failure_at (vinfo->stmt,
2663 : "unsupported grouped store\n");
2664 237138 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2665 : {
2666 2200 : vinfo = SLP_TREE_REPRESENTATIVE (node);
2667 2200 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
2668 : {
2669 1925 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2670 1925 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2671 1925 : size = DR_GROUP_SIZE (vinfo);
2672 1925 : vectype = SLP_TREE_VECTYPE (node);
2673 1925 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
2674 1925 : && ! vect_grouped_load_supported (vectype, single_element_p,
2675 : size))
2676 265 : return opt_result::failure_at (vinfo->stmt,
2677 : "unsupported grouped load\n");
2678 : }
2679 : }
2680 : }
2681 :
2682 : /* Roll back state appropriately. Force single-lane SLP this time. */
2683 135367 : force_single_lane = true;
2684 135367 : if (dump_enabled_p ())
2685 3379 : dump_printf_loc (MSG_NOTE, vect_location,
2686 : "re-trying with single-lane SLP\n");
2687 :
2688 : /* Reset the vectorization factor. */
2689 135367 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
2690 : /* Free the SLP instances. */
2691 369029 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2692 233662 : vect_free_slp_instance (instance);
2693 135367 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2694 : /* Reset altered state on stmts. */
2695 514007 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2696 : {
2697 378640 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2698 378640 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
2699 681114 : !gsi_end_p (si); gsi_next (&si))
2700 : {
2701 302474 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2702 302474 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2703 302474 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2704 : {
2705 : /* vectorizable_reduction adjusts reduction stmt def-types,
2706 : restore them to that of the PHI. */
2707 25533 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2708 25533 : = STMT_VINFO_DEF_TYPE (stmt_info);
2709 25533 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2710 : (STMT_VINFO_REDUC_DEF (stmt_info)))
2711 25533 : = STMT_VINFO_DEF_TYPE (stmt_info);
2712 : }
2713 : }
2714 : }
2715 : /* Free optimized alias test DDRS. */
2716 135367 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2717 135367 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2718 135367 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2719 : /* Reset target cost data. */
2720 135367 : delete loop_vinfo->vector_costs;
2721 135367 : loop_vinfo->vector_costs = nullptr;
2722 : /* Reset accumulated rgroup information. */
2723 135367 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
2724 135367 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
2725 135367 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2726 : /* Reset assorted flags. */
2727 135367 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2728 135367 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2729 135367 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2730 135367 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2731 135367 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2732 135367 : = saved_can_use_partial_vectors_p;
2733 135367 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2734 135367 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2735 135367 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2736 135367 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = false;
2737 :
2738 135367 : if (loop_vinfo->scan_map)
2739 122 : loop_vinfo->scan_map->empty ();
2740 :
2741 135367 : goto start_over;
2742 : }
2743 :
2744 : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2745 : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2746 : OLD_LOOP_VINFO is better unless something specifically indicates
2747 : otherwise. Both loop_vinfos must refer to the same loop (this is asserted).
2748 :
2749 : Note that this deliberately isn't a partial order. */
2750 :
2751 : static bool
2752 32393 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2753 : loop_vec_info old_loop_vinfo)
2754 : {
2755 32393 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2756 32393 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2757 :
2758 32393 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2759 32393 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2760 :
2761 : /* Always prefer a VF of loop->simdlen over any other VF. If both or neither VF equals simdlen, fall through to the cost comparison. */
2762 32393 : if (loop->simdlen)
2763 : {
2764 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2765 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2766 0 : if (new_simdlen_p != old_simdlen_p)
2767 : return new_simdlen_p;
2768 : }
2769 : /* Otherwise defer to the vector_costs of NEW_LOOP_VINFO. When OLD_LOOP_VINFO has a LOOP_VINFO_ORIG_LOOP_INFO, ask whether NEW is the better epilogue loop for that main loop; otherwise ask whether it is the better main loop. */
2770 32393 : const auto *old_costs = old_loop_vinfo->vector_costs;
2771 32393 : const auto *new_costs = new_loop_vinfo->vector_costs;
2772 32393 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2773 1515 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2774 :
2775 30878 : return new_costs->better_main_loop_than_p (old_costs);
2776 : }
2777 :
2778 : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2779 : true if we should. The verdict is exactly that of vect_better_loop_vinfo_p; this wrapper only adds a dump note naming both vector modes when NEW_LOOP_VINFO wins. */
2780 :
2781 : static bool
2782 32393 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2783 : loop_vec_info old_loop_vinfo)
2784 : {
2785 32393 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2786 : return false;
2787 : /* NEW_LOOP_VINFO is preferred; say so if dumping is enabled. */
2788 1215 : if (dump_enabled_p ())
2789 11 : dump_printf_loc (MSG_NOTE, vect_location,
2790 : "***** Preferring vector mode %s to vector mode %s\n",
2791 11 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
2792 11 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
2793 : return true;
2794 : }
2795 :
2796 : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
2797 : not NULL. When MASKED_P is not -1 override the default
2798 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it. SHARED and LOOP_FORM_INFO are handed to vect_create_loop_vinfo, and FATAL to vect_analyze_loop_2.
2799 : Set AUTODETECTED_VECTOR_MODE if VECTOR_MODES[MODE_I] is VOIDmode and advance MODE_I to the next
2800 : mode useful to analyze. After a successful non-epilogue analysis, a suggested or user-requested unroll factor triggers one re-analysis, whose loop_vinfo is kept if it succeeds.
2801 : Return the loop_vinfo on success and wrapped null on failure. */
2802 :
2803 : static opt_loop_vec_info
2804 575492 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2805 : const vect_loop_form_info *loop_form_info,
2806 : loop_vec_info orig_loop_vinfo,
2807 : const vector_modes &vector_modes, unsigned &mode_i,
2808 : int masked_p,
2809 : machine_mode &autodetected_vector_mode,
2810 : bool &fatal)
2811 : {
2812 575492 : loop_vec_info loop_vinfo
2813 575492 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
2814 :
2815 575492 : machine_mode vector_mode = vector_modes[mode_i];
2816 575492 : loop_vinfo->vector_mode = vector_mode;
2817 575492 : unsigned int suggested_unroll_factor = 1;
2818 575492 : bool single_lane_slp_done_for_suggested_uf = false;
2819 :
2820 : /* Run the main analysis. */
2821 575492 : opt_result res = vect_analyze_loop_2 (loop_vinfo, masked_p, fatal,
2822 : &suggested_unroll_factor,
2823 : single_lane_slp_done_for_suggested_uf);
2824 575492 : if (dump_enabled_p ())
2825 21227 : dump_printf_loc (MSG_NOTE, vect_location,
2826 : "***** Analysis %s with vector mode %s\n",
2827 21227 : res ? "succeeded" : "failed",
2828 21227 : GET_MODE_NAME (loop_vinfo->vector_mode));
2829 : /* Unrolling is only considered when the analysis succeeded and this is not an epilogue analysis. */
2830 575492 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
2831 575492 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2832 : /* Check to see if the user wants to unroll or if the target wants to. */
2833 661124 : && (suggested_unroll_factor > 1 || user_unroll > 1))
2834 : {
2835 463 : if (suggested_unroll_factor == 1)
2836 : {
2837 66 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
2838 66 : suggested_unroll_factor = user_unroll / assumed_vf; /* Truncating division; a quotient below 2 means no retry below. */
2839 66 : if (suggested_unroll_factor > 1)
2840 : {
2841 40 : if (dump_enabled_p ())
2842 20 : dump_printf_loc (MSG_NOTE, vect_location,
2843 : "setting unroll factor to %d based on user requested "
2844 : "unroll factor %d and suggested vectorization "
2845 : "factor: %d\n",
2846 : suggested_unroll_factor, user_unroll, assumed_vf);
2847 : }
2848 : }
2849 : /* Re-run the analysis on a fresh, non-epilogue loop_vinfo that carries the unroll factor; it replaces LOOP_VINFO only if that analysis succeeds. */
2850 463 : if (suggested_unroll_factor > 1)
2851 : {
2852 437 : if (dump_enabled_p ())
2853 56 : dump_printf_loc (MSG_NOTE, vect_location,
2854 : "***** Re-trying analysis for unrolling"
2855 : " with unroll factor %d and %s slp.\n",
2856 : suggested_unroll_factor,
2857 : single_lane_slp_done_for_suggested_uf
2858 : ? "single-lane" : "");
2859 437 : loop_vec_info unroll_vinfo
2860 437 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
2861 437 : unroll_vinfo->vector_mode = vector_mode;
2862 437 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2863 437 : opt_result new_res
2864 437 : = vect_analyze_loop_2 (unroll_vinfo, masked_p, fatal, NULL,
2865 : single_lane_slp_done_for_suggested_uf);
2866 437 : if (new_res)
2867 : {
2868 378 : delete loop_vinfo;
2869 378 : loop_vinfo = unroll_vinfo;
2870 : }
2871 : else
2872 59 : delete unroll_vinfo;
2873 : }
2874 :
2875 : /* Record that we have honored a user unroll factor. */
2876 463 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
2877 : }
2878 :
2879 : /* Remember the autodetected vector mode, read back from the loop_vinfo since the analysis was started with VOIDmode. */
2880 575492 : if (vector_mode == VOIDmode)
2881 267850 : autodetected_vector_mode = loop_vinfo->vector_mode;
2882 :
2883 : /* Advance mode_i, first skipping modes that would result in the
2884 : same analysis result, then at most one mode that would repeat the analysis for the autodetected mode, and finally stepping past the last mode handled. */
2885 2546314 : while (mode_i + 1 < vector_modes.length ()
2886 1753313 : && vect_chooses_same_modes_p (loop_vinfo,
2887 767902 : vector_modes[mode_i + 1]))
2888 : {
2889 409919 : if (dump_enabled_p ())
2890 17083 : dump_printf_loc (MSG_NOTE, vect_location,
2891 : "***** The result for vector mode %s would"
2892 : " be the same\n",
2893 17083 : GET_MODE_NAME (vector_modes[mode_i + 1]));
2894 409919 : mode_i += 1;
2895 : }
2896 575492 : if (mode_i + 1 < vector_modes.length ()
2897 933475 : && vect_chooses_same_modes_p (autodetected_vector_mode,
2898 357983 : vector_modes[mode_i + 1]))
2899 : {
2900 428 : if (dump_enabled_p ())
2901 11 : dump_printf_loc (MSG_NOTE, vect_location,
2902 : "***** Skipping vector mode %s, which would"
2903 : " repeat the analysis for %s\n",
2904 11 : GET_MODE_NAME (vector_modes[mode_i + 1]),
2905 11 : GET_MODE_NAME (autodetected_vector_mode));
2906 428 : mode_i += 1;
2907 : }
2908 575492 : mode_i++;
2909 : /* On failure free the loop_vinfo and propagate the failure. A fatal failure is only expected when this was not an epilogue analysis, which is checked below. */
2910 575492 : if (!res)
2911 : {
2912 481522 : delete loop_vinfo;
2913 481522 : if (fatal)
2914 103948 : gcc_checking_assert (orig_loop_vinfo == NULL);
2915 481522 : return opt_loop_vec_info::propagate_failure (res);
2916 : }
2917 :
2918 93970 : return opt_loop_vec_info::success (loop_vinfo);
2919 : }
2920 :
2921 : /* Function vect_analyze_loop.
2922 :
2923 : Apply a set of analyses on LOOP, and create a loop_vec_info struct
2924 : for it. The different analyses will record information in the
2925 : loop_vec_info struct. */
2926 : opt_loop_vec_info
2927 466508 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
2928 : vec_info_shared *shared)
2929 : {
2930 466508 : DUMP_VECT_SCOPE ("analyze_loop_nest");
2931 :
2932 466508 : if (loop_outer (loop)
2933 466508 : && loop_vec_info_for_loop (loop_outer (loop))
2934 467091 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2935 583 : return opt_loop_vec_info::failure_at (vect_location,
2936 : "outer-loop already vectorized.\n");
2937 :
2938 465925 : if (!find_loop_nest (loop, &shared->loop_nest))
2939 22322 : return opt_loop_vec_info::failure_at
2940 22322 : (vect_location,
2941 : "not vectorized: loop nest containing two or more consecutive inner"
2942 : " loops cannot be vectorized\n");
2943 :
2944 : /* Analyze the loop form. */
2945 443603 : vect_loop_form_info loop_form_info;
2946 443603 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
2947 : &loop_form_info);
2948 443603 : if (!res)
2949 : {
2950 175753 : if (dump_enabled_p ())
2951 1519 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2952 : "bad loop form.\n");
2953 175753 : return opt_loop_vec_info::propagate_failure (res);
2954 : }
2955 267850 : if (!integer_onep (loop_form_info.assumptions))
2956 : {
2957 : /* We consider to vectorize this loop by versioning it under
2958 : some assumptions. In order to do this, we need to clear
2959 : existing information computed by scev and niter analyzer. */
2960 8386 : scev_reset_htab ();
2961 8386 : free_numbers_of_iterations_estimates (loop);
2962 : /* Also set flag for this loop so that following scev and niter
2963 : analysis are done under the assumptions. */
2964 8386 : loop_constraint_set (loop, LOOP_C_FINITE);
2965 : }
2966 : else
2967 : /* Clear the existing niter information to make sure the nonwrapping flag
2968 : will be calculated and set appropriately. */
2969 259464 : free_numbers_of_iterations_estimates (loop);
2970 :
2971 267850 : auto_vector_modes vector_modes;
2972 : /* Autodetect first vector size we try. */
2973 267850 : vector_modes.safe_push (VOIDmode);
2974 267850 : unsigned int autovec_flags
2975 535700 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2976 267850 : loop->simdlen != 0);
2977 267850 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2978 267850 : && !unlimited_cost_model (loop));
2979 267850 : machine_mode autodetected_vector_mode = VOIDmode;
2980 267850 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2981 267850 : unsigned int mode_i = 0;
2982 267850 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2983 :
2984 : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
2985 : a mode has not been analyzed. */
2986 267850 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
2987 2690206 : for (unsigned i = 0; i < vector_modes.length (); ++i)
2988 1077253 : cached_vf_per_mode.safe_push (0);
2989 :
2990 : /* First determine the main loop vectorization mode, either the first
2991 : one that works, starting with auto-detecting the vector mode and then
2992 : following the targets order of preference, or the one with the
2993 : lowest cost if pick_lowest_cost_p. */
2994 855324 : while (1)
2995 : {
2996 561587 : bool fatal;
2997 561587 : unsigned int last_mode_i = mode_i;
2998 : /* Set cached VF to -1 prior to analysis, which indicates a mode has
2999 : failed. */
3000 561587 : cached_vf_per_mode[last_mode_i] = -1;
3001 561587 : opt_loop_vec_info loop_vinfo
3002 561587 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3003 : NULL, vector_modes, mode_i, -1,
3004 : autodetected_vector_mode, fatal);
3005 561587 : if (fatal)
3006 : break;
3007 :
3008 457639 : if (loop_vinfo)
3009 : {
3010 : /* Analysis has been successful so update the VF value. The
3011 : VF should always be a multiple of unroll_factor and we want to
3012 : capture the original VF here. */
3013 85632 : cached_vf_per_mode[last_mode_i]
3014 85632 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3015 85632 : loop_vinfo->suggested_unroll_factor);
3016 : /* Once we hit the desired simdlen for the first time,
3017 : discard any previous attempts. */
3018 85632 : if (simdlen
3019 85632 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3020 : {
3021 47 : delete first_loop_vinfo;
3022 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3023 : simdlen = 0;
3024 : }
3025 85585 : else if (pick_lowest_cost_p
3026 71682 : && first_loop_vinfo
3027 116463 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3028 : {
3029 : /* Pick loop_vinfo over first_loop_vinfo. */
3030 1029 : delete first_loop_vinfo;
3031 1029 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3032 : }
3033 85632 : if (first_loop_vinfo == NULL)
3034 : first_loop_vinfo = loop_vinfo;
3035 : else
3036 : {
3037 29851 : delete loop_vinfo;
3038 29851 : loop_vinfo = opt_loop_vec_info::success (NULL);
3039 : }
3040 :
3041 : /* Commit to first_loop_vinfo if we have no reason to try
3042 : alternatives. */
3043 85632 : if (!simdlen && !pick_lowest_cost_p)
3044 : break;
3045 : }
3046 443698 : if (mode_i == vector_modes.length ()
3047 443698 : || autodetected_vector_mode == VOIDmode)
3048 : break;
3049 :
3050 : /* Try the next biggest vector size. */
3051 293737 : if (dump_enabled_p ())
3052 4758 : dump_printf_loc (MSG_NOTE, vect_location,
3053 : "***** Re-trying analysis with vector mode %s\n",
3054 4758 : GET_MODE_NAME (vector_modes[mode_i]));
3055 293737 : }
3056 267850 : if (!first_loop_vinfo)
3057 213103 : return opt_loop_vec_info::propagate_failure (res);
3058 :
3059 54747 : if (dump_enabled_p ())
3060 9531 : dump_printf_loc (MSG_NOTE, vect_location,
3061 : "***** Choosing vector mode %s\n",
3062 9531 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3063 :
3064 : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3065 : enabled, SIMDUID is not set, it is the innermost loop and we have
3066 : either already found the loop's SIMDLEN or there was no SIMDLEN to
3067 : begin with.
3068 : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3069 54747 : bool vect_epilogues = (!simdlen
3070 54745 : && loop->inner == NULL
3071 54148 : && param_vect_epilogues_nomask
3072 53008 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3073 : /* No code motion support for multiple epilogues so for now
3074 : not supported when multiple exits. */
3075 25954 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3076 25492 : && !loop->simduid
3077 78826 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3078 54747 : if (!vect_epilogues)
3079 41850 : return first_loop_vinfo;
3080 :
3081 : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3082 :
3083 : /* For epilogues start the analysis from the first mode. The motivation
3084 : behind starting from the beginning comes from cases where the VECTOR_MODES
3085 : array may contain length-agnostic and length-specific modes. Their
3086 : ordering is not guaranteed, so we could end up picking a mode for the main
3087 : loop that is after the epilogue's optimal mode. */
3088 12897 : int masked_p = -1;
3089 12897 : if (!unlimited_cost_model (loop)
3090 12897 : && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3091 : != VOIDmode))
3092 : {
3093 4 : vector_modes[0]
3094 4 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3095 4 : cached_vf_per_mode[0] = 0;
3096 : }
3097 : else
3098 12893 : vector_modes[0] = autodetected_vector_mode;
3099 12897 : mode_i = 0;
3100 :
3101 12933 : bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
3102 12897 : || masked_p == 1);
3103 : if (supports_partial_vectors
3104 36 : && !partial_vectors_supported_p ()
3105 36 : && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo))
3106 : supports_partial_vectors = false;
3107 12897 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3108 :
3109 12897 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3110 13079 : do
3111 : {
3112 : /* Let the user override what the target suggests. */
3113 12988 : if (OPTION_SET_P (param_vect_partial_vector_usage))
3114 45 : masked_p = -1;
3115 :
3116 50276 : while (1)
3117 : {
3118 : /* If the target does not support partial vectors we can shorten the
3119 : number of modes to analyze for the epilogue as we know we can't
3120 : pick a mode that would lead to a VF at least as big as the
3121 : FIRST_VINFO_VF. */
3122 66836 : if (!supports_partial_vectors
3123 50276 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3124 : {
3125 23660 : mode_i++;
3126 47320 : if (mode_i == vector_modes.length ())
3127 : break;
3128 29271 : continue;
3129 : }
3130 : /* We would need an exhaustive search to find all modes we
3131 : skipped but that would lead to the same result as the
3132 : analysis it was skipped for and where we could check
3133 : cached_vf_per_mode against.
3134 : Check for the autodetected mode, which is the common
3135 : situation on x86 which does not perform cost comparison. */
3136 39327 : if (!supports_partial_vectors
3137 26573 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3138 52396 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3139 25780 : vector_modes[mode_i]))
3140 : {
3141 12711 : mode_i++;
3142 25422 : if (mode_i == vector_modes.length ())
3143 : break;
3144 12711 : continue;
3145 : }
3146 :
3147 13905 : if (dump_enabled_p ())
3148 3255 : dump_printf_loc (MSG_NOTE, vect_location,
3149 : "***** Re-trying epilogue analysis with vector "
3150 3255 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3151 :
3152 13905 : bool fatal;
3153 13905 : opt_loop_vec_info loop_vinfo
3154 13905 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3155 : orig_loop_vinfo,
3156 : vector_modes, mode_i, masked_p,
3157 : autodetected_vector_mode, fatal);
3158 13905 : if (fatal)
3159 : break;
3160 :
3161 13905 : if (loop_vinfo)
3162 : {
3163 8338 : if (pick_lowest_cost_p
3164 5388 : && orig_loop_vinfo->epilogue_vinfo
3165 9853 : && vect_joust_loop_vinfos (loop_vinfo,
3166 1515 : orig_loop_vinfo->epilogue_vinfo))
3167 : {
3168 186 : gcc_assert (vect_epilogues);
3169 186 : delete orig_loop_vinfo->epilogue_vinfo;
3170 186 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3171 : }
3172 8338 : if (!orig_loop_vinfo->epilogue_vinfo)
3173 7009 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3174 : else
3175 : {
3176 1329 : delete loop_vinfo;
3177 1329 : loop_vinfo = opt_loop_vec_info::success (NULL);
3178 : }
3179 :
3180 : /* For now only allow one epilogue loop, but allow
3181 : pick_lowest_cost_p to replace it, so commit to the
3182 : first epilogue if we have no reason to try alternatives. */
3183 8338 : if (!pick_lowest_cost_p)
3184 : break;
3185 : }
3186 :
3187 : /* Revert back to the default from the suggested preferred
3188 : epilogue vectorization mode. */
3189 10955 : masked_p = -1;
3190 21910 : if (mode_i == vector_modes.length ())
3191 : break;
3192 : }
3193 :
3194 12988 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3195 12988 : if (!orig_loop_vinfo)
3196 : break;
3197 :
3198 : /* When we selected a first vectorized epilogue, see if the target
3199 : suggests to have another one. */
3200 6823 : masked_p = -1;
3201 6823 : if (!unlimited_cost_model (loop)
3202 3879 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3203 10696 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3204 : != VOIDmode))
3205 : {
3206 182 : vector_modes[0]
3207 91 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3208 91 : cached_vf_per_mode[0] = 0;
3209 91 : mode_i = 0;
3210 : }
3211 : else
3212 : break;
3213 91 : }
3214 : while (1);
3215 :
3216 12897 : if (first_loop_vinfo->epilogue_vinfo)
3217 : {
3218 6740 : poly_uint64 lowest_th
3219 6740 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3220 6740 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3221 6823 : do
3222 : {
3223 6823 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3224 6823 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3225 : || maybe_ne (lowest_th, 0U));
3226 : /* Keep track of the known smallest versioning threshold. */
3227 6823 : if (ordered_p (lowest_th, th))
3228 6823 : lowest_th = ordered_min (lowest_th, th);
3229 6823 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3230 : }
3231 6823 : while (epilog_vinfo);
3232 6740 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3233 6740 : if (dump_enabled_p ())
3234 1449 : dump_printf_loc (MSG_NOTE, vect_location,
3235 : "***** Choosing epilogue vector mode %s\n",
3236 1449 : GET_MODE_NAME
3237 : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3238 : }
3239 :
3240 12897 : return first_loop_vinfo;
3241 711453 : }
3242 :
3243 : /* Return true if there is an in-order reduction function for CODE, storing
3244 : it in *REDUC_FN if so. Otherwise return false and leave *REDUC_FN untouched. */
3245 :
3246 : static bool
3247 5098 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3248 : {
3249 : /* We support MINUS_EXPR by negating the operand. This also preserves an
3250 : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3251 : (-0.0) = -0.0. The negation itself is not done here. */
3252 5098 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3253 : {
3254 4426 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3255 0 : return true;
3256 : }
3257 : return false;
3258 : }
3259 :
3260 : /* Function reduction_fn_for_scalar_code
3261 :
3262 : Input:
3263 : CODE - the code (a tree code or a combined function) of a reduction operation.
3264 :
3265 : Output:
3266 : REDUC_FN - the corresponding internal function to be used to reduce the
3267 : vector of partial results into a single scalar result, or IFN_LAST
3268 : if the operation is a supported reduction operation, but does not have
3269 : such an internal function.
3270 :
3271 : Return FALSE if CODE currently cannot be vectorized as reduction, in which case *REDUC_FN is not written. */
3272 :
3273 : bool
3274 2028680 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3275 : {
3276 2028680 : if (code.is_tree_code ())
3277 2028622 : switch (tree_code (code))
3278 : {
3279 15267 : case MAX_EXPR:
3280 15267 : *reduc_fn = IFN_REDUC_MAX;
3281 15267 : return true;
3282 :
3283 63209 : case MIN_EXPR:
3284 63209 : *reduc_fn = IFN_REDUC_MIN;
3285 63209 : return true;
3286 :
3287 1101974 : case PLUS_EXPR:
3288 1101974 : *reduc_fn = IFN_REDUC_PLUS;
3289 1101974 : return true;
3290 :
3291 232581 : case BIT_AND_EXPR:
3292 232581 : *reduc_fn = IFN_REDUC_AND;
3293 232581 : return true;
3294 :
3295 287642 : case BIT_IOR_EXPR:
3296 287642 : *reduc_fn = IFN_REDUC_IOR;
3297 287642 : return true;
3298 :
3299 44308 : case BIT_XOR_EXPR:
3300 44308 : *reduc_fn = IFN_REDUC_XOR;
3301 44308 : return true;
3302 :
3303 283641 : case MULT_EXPR:
3304 283641 : case MINUS_EXPR:
3305 283641 : *reduc_fn = IFN_LAST;
3306 283641 : return true;
3307 :
3308 : default:
3309 : return false;
3310 : }
3311 : else
3312 58 : switch (combined_fn (code))
3313 : {
3314 34 : CASE_CFN_FMAX:
3315 34 : *reduc_fn = IFN_REDUC_FMAX;
3316 34 : return true;
3317 :
3318 24 : CASE_CFN_FMIN:
3319 24 : *reduc_fn = IFN_REDUC_FMIN;
3320 24 : return true;
3321 :
3322 : default:
3323 : return false;
3324 : }
3325 : }
3326 :
3327 : /* Set *SBOOL_FN to the corresponding function working on vector masks
3328 : for REDUC_FN (handled: IFN_REDUC_AND, IFN_REDUC_IOR and IFN_REDUC_XOR). Return true if that exists; otherwise return false without writing *SBOOL_FN. */
3329 :
3330 : static bool
3331 0 : sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
3332 : {
3333 0 : switch (reduc_fn)
3334 : {
3335 0 : case IFN_REDUC_AND:
3336 0 : *sbool_fn = IFN_REDUC_SBOOL_AND;
3337 0 : return true;
3338 0 : case IFN_REDUC_IOR:
3339 0 : *sbool_fn = IFN_REDUC_SBOOL_IOR;
3340 0 : return true;
3341 0 : case IFN_REDUC_XOR:
3342 0 : *sbool_fn = IFN_REDUC_SBOOL_XOR;
3343 0 : return true;
3344 : default:
3345 : return false;
3346 : }
3347 : }
3348 :
3349 : /* If there is a neutral value X such that a reduction would not be affected
3350 : by the introduction of additional X elements, return that X, otherwise
3351 : return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3352 : of the scalar elements. If the reduction has just a single initial value
3353 : then INITIAL_VALUE is that value, otherwise it is null.
3354 : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3355 : In that case plain zero rather than -0.0 is returned for PLUS_EXPR and WIDEN_SUM_EXPR. For MIN/MAX (and FMIN/FMAX) reductions INITIAL_VALUE itself is returned. */
3356 :
3357 : tree
3358 77322 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3359 : tree initial_value, bool as_initial)
3360 : {
3361 77322 : if (code.is_tree_code ())
3362 77264 : switch (tree_code (code))
3363 : {
3364 13812 : case DOT_PROD_EXPR:
3365 13812 : case SAD_EXPR:
3366 13812 : case MINUS_EXPR:
3367 13812 : case BIT_IOR_EXPR:
3368 13812 : case BIT_XOR_EXPR:
3369 13812 : return build_zero_cst (scalar_type);
3370 57358 : case WIDEN_SUM_EXPR:
3371 57358 : case PLUS_EXPR:
3372 57358 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3373 100 : return build_real (scalar_type, dconstm0);
3374 : else
3375 57258 : return build_zero_cst (scalar_type);
3376 :
3377 2100 : case MULT_EXPR:
3378 2100 : return build_one_cst (scalar_type);
3379 :
3380 1558 : case BIT_AND_EXPR:
3381 1558 : return build_all_ones_cst (scalar_type);
3382 :
3383 : case MAX_EXPR:
3384 : case MIN_EXPR:
3385 : return initial_value;
3386 :
3387 408 : default:
3388 408 : return NULL_TREE;
3389 : }
3390 : else
3391 58 : switch (combined_fn (code))
3392 : {
3393 : CASE_CFN_FMIN:
3394 : CASE_CFN_FMAX:
3395 : return initial_value;
3396 :
3397 0 : default:
3398 0 : return NULL_TREE;
3399 : }
3400 : }
3401 :
3402 : /* Reporting helper for vect_is_simple_reduction below. Dump message MSG
3403 : followed by GIMPLE statement STMT at vect_location, using dump kind MSG_TYPE. */
3404 :
3405 : static void
3406 578 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3407 : {
3408 578 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3409 578 : }
3410 :
3411 : /* Return true if we need an in-order reduction for operation CODE
3412 : on type TYPE. Floating-point types need one unless CODE is a min/max operation or flag_associative_math is set. Integral types need one when CODE
3413 : is not a tree code or operation_no_trapping_overflow rejects it. Saturating fixed-point types always need one; any other type never does. */
3414 :
3415 : bool
3416 6554616 : needs_fold_left_reduction_p (tree type, code_helper code)
3417 : {
3418 : /* CHECKME: check for !flag_finite_math_only too? */
3419 6554616 : if (SCALAR_FLOAT_TYPE_P (type))
3420 : {
3421 578052 : if (code.is_tree_code ())
3422 577998 : switch (tree_code (code))
3423 : {
3424 : case MIN_EXPR:
3425 : case MAX_EXPR:
3426 : return false;
3427 :
3428 576150 : default:
3429 576150 : return !flag_associative_math;
3430 : }
3431 : else
3432 54 : switch (combined_fn (code))
3433 : {
3434 : CASE_CFN_FMIN:
3435 : CASE_CFN_FMAX:
3436 : return false;
3437 :
3438 2 : default:
3439 2 : return !flag_associative_math;
3440 : }
3441 : }
3442 :
3443 5976564 : if (INTEGRAL_TYPE_P (type))
3444 5975691 : return (!code.is_tree_code ()
3445 5975691 : || !operation_no_trapping_overflow (type, tree_code (code)));
3446 :
3447 873 : if (SAT_FIXED_POINT_TYPE_P (type))
3448 : return true;
3449 :
3450 : return false;
3451 : }
3452 :
3453 : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3454 : has a handled computation expression. Store the main reduction
3455 : operation in *CODE and the walked chain of uses in PATH. INNER_LOOP_OF_DOUBLE_REDUC is true when LOOP is the inner loop of a double reduction, in which case out-of-loop uses along the path are not tolerated. */
3456 :
3457 : static bool
3458 101648 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3459 : tree loop_arg, code_helper *code,
3460 : vec<std::pair<ssa_op_iter, use_operand_p> > &path,
3461 : bool inner_loop_of_double_reduc)
3462 : {
3463 101648 : auto_bitmap visited;
3464 101648 : tree lookfor = PHI_RESULT (phi);
3465 101648 : ssa_op_iter curri;
3466 101648 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3467 211447 : while (USE_FROM_PTR (curr) != loop_arg)
3468 8151 : curr = op_iter_next_use (&curri);
3469 101648 : curri.i = curri.numops; /* NOTE(review): presumably marks the PHI iterator as exhausted so that popping back to it ends the walk -- confirm. */
3470 946188 : do
3471 : {
3472 946188 : path.safe_push (std::make_pair (curri, curr));
3473 946188 : tree use = USE_FROM_PTR (curr);
3474 946188 : if (use == lookfor)
3475 : break;
3476 844940 : gimple *def = SSA_NAME_DEF_STMT (use);
3477 844940 : if (gimple_nop_p (def)
3478 844940 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3479 : {
3480 711211 : pop:
3481 711211 : do
3482 : {
3483 711211 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3484 711211 : curri = x.first;
3485 711211 : curr = x.second;
3486 778422 : do
3487 778422 : curr = op_iter_next_use (&curri);
3488 : /* Skip already visited or non-SSA operands (from iterating
3489 : over PHI args). */
3490 : while (curr != NULL_USE_OPERAND_P
3491 1556844 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3492 268950 : || ! bitmap_set_bit (visited,
3493 268950 : SSA_NAME_VERSION
3494 : (USE_FROM_PTR (curr)))));
3495 : }
3496 1422422 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3497 237885 : if (curr == NULL_USE_OPERAND_P)
3498 : break;
3499 : }
3500 : else
3501 : {
3502 710598 : if (gimple_code (def) == GIMPLE_PHI)
3503 72162 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3504 : else
3505 638436 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3506 : while (curr != NULL_USE_OPERAND_P
3507 848541 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3508 739867 : || ! bitmap_set_bit (visited,
3509 739867 : SSA_NAME_VERSION
3510 : (USE_FROM_PTR (curr)))))
3511 137943 : curr = op_iter_next_use (&curri);
3512 710598 : if (curr == NULL_USE_OPERAND_P)
3513 103543 : goto pop;
3514 : }
3515 : }
3516 : while (1);
3517 101648 : if (dump_file && (dump_flags & TDF_DETAILS))
3518 : {
3519 4084 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3520 4084 : unsigned i;
3521 4084 : std::pair<ssa_op_iter, use_operand_p> *x;
3522 13869 : FOR_EACH_VEC_ELT (path, i, x)
3523 9785 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3524 4084 : dump_printf (MSG_NOTE, "\n");
3525 : }
3526 :
3527 : /* Check whether the reduction path detected is valid. */
3528 101648 : bool fail = path.length () == 0;
3529 101648 : bool neg = false;
3530 101648 : int sign = -1;
3531 101648 : *code = ERROR_MARK;
3532 218678 : for (unsigned i = 1; i < path.length (); ++i)
3533 : {
3534 123707 : gimple *use_stmt = USE_STMT (path[i].second);
3535 123707 : gimple_match_op op;
3536 123707 : if (!gimple_extract_op (use_stmt, &op))
3537 : {
3538 : fail = true;
3539 6677 : break;
3540 : }
3541 122820 : unsigned int opi = op.num_ops;
3542 122820 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3543 : {
3544 : /* The following make sure we can compute the operand index
3545 : easily plus it mostly disallows chaining via COND_EXPR condition
3546 : operands. */
3547 189959 : for (opi = 0; opi < op.num_ops; ++opi)
3548 188946 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3549 : break;
3550 : }
3551 6236 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3552 : {
3553 12497 : for (opi = 0; opi < op.num_ops; ++opi)
3554 12497 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3555 : break;
3556 : }
3557 122820 : if (opi == op.num_ops)
3558 : {
3559 : fail = true;
3560 : break;
3561 : }
3562 121807 : op.code = canonicalize_code (op.code, op.type);
3563 121807 : if (op.code == MINUS_EXPR)
3564 : {
3565 5602 : op.code = PLUS_EXPR;
3566 : /* Track whether we negate the reduction value each iteration. */
3567 5602 : if (op.ops[1] == op.ops[opi])
3568 34 : neg = ! neg;
3569 : }
3570 116205 : else if (op.code == IFN_COND_SUB)
3571 : {
3572 9 : op.code = IFN_COND_ADD;
3573 : /* Track whether we negate the reduction value each iteration. */
3574 9 : if (op.ops[2] == op.ops[opi])
3575 0 : neg = ! neg;
3576 : }
3577 : /* For an FMA the reduction code is the PLUS if the addition chain
3578 : is the reduction. */
3579 116196 : else if (op.code == IFN_FMA && opi == 2)
3580 33 : op.code = PLUS_EXPR;
3581 121807 : if (CONVERT_EXPR_CODE_P (op.code)
3582 121807 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3583 : ;
3584 116329 : else if (*code == ERROR_MARK)
3585 : {
3586 99445 : *code = op.code;
3587 99445 : sign = TYPE_SIGN (op.type);
3588 : }
3589 16884 : else if (op.code != *code)
3590 : {
3591 : fail = true;
3592 : break;
3593 : }
3594 15564 : else if ((op.code == MIN_EXPR
3595 15408 : || op.code == MAX_EXPR)
3596 15579 : && sign != TYPE_SIGN (op.type))
3597 : {
3598 : fail = true;
3599 : break;
3600 : }
3601 : /* Check there's only a single stmt the op is used on. For the
3602 : not value-changing tail and the last stmt allow out-of-loop uses,
3603 : but not when this is the inner loop of a double reduction.
3604 : ??? We could relax this and handle arbitrary live stmts by
3605 : forcing a scalar epilogue for example. */
3606 120484 : imm_use_iterator imm_iter;
3607 120484 : use_operand_p use_p;
3608 120484 : gimple *op_use_stmt;
3609 120484 : unsigned cnt = 0;
3610 126685 : bool cond_fn_p = op.code.is_internal_fn ()
3611 6201 : && (conditional_internal_fn_code (internal_fn (op.code))
3612 120484 : != ERROR_MARK);
3613 :
3614 409710 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3615 : {
3616 : /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
3617 : have op1 twice (once as definition, once as else) in the same
3618 : operation. Enforce this. */
3619 168742 : if (cond_fn_p && op_use_stmt == use_stmt)
3620 : {
3621 6135 : gcall *call = as_a<gcall *> (use_stmt);
3622 6135 : unsigned else_pos
3623 6135 : = internal_fn_else_index (internal_fn (op.code));
3624 6135 : if (gimple_call_arg (call, else_pos) != op.ops[opi])
3625 : {
3626 : fail = true;
3627 : break;
3628 : }
3629 30675 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
3630 : {
3631 24540 : if (j == else_pos)
3632 6135 : continue;
3633 18405 : if (gimple_call_arg (call, j) == op.ops[opi])
3634 6135 : cnt++;
3635 : }
3636 : }
3637 162607 : else if (!is_gimple_debug (op_use_stmt)
3638 162607 : && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
3639 2813 : || flow_bb_inside_loop_p (loop,
3640 2813 : gimple_bb (op_use_stmt))))
3641 235753 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3642 117881 : cnt++;
3643 120484 : }
3644 :
3645 120484 : if (cnt != 1)
3646 : {
3647 : fail = true;
3648 : break;
3649 : }
3650 : }
3651 108736 : return ! fail && ! neg && *code != ERROR_MARK;
3652 101648 : }
3653 : /* Overload of the above that uses a local path and does not treat LOOP as the inner loop of a double reduction. Return true if a handled reduction path exists and its main operation equals CODE. */
3654 : bool
3655 21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3656 : tree loop_arg, enum tree_code code)
3657 : {
3658 21 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3659 21 : code_helper code_;
3660 21 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
3661 21 : && code_ == code);
3662 21 : }
3663 :
3664 :
3665 :
3666 : /* Function vect_is_simple_reduction
3667 :
3668 : (1) Detect a cross-iteration def-use cycle that represents a simple
3669 : reduction computation. We look for the following pattern:
3670 :
3671 : loop_header:
3672 : a1 = phi < a0, a2 >
3673 : a3 = ...
3674 : a2 = operation (a3, a1)
3675 :
3676 : or
3677 :
3678 : a3 = ...
3679 : loop_header:
3680 : a1 = phi < a0, a2 >
3681 : a2 = operation (a3, a1)
3682 :
3683 : such that:
3684 : 1. operation is commutative and associative and it is safe to
3685 : change the order of the computation
3686 : 2. no uses for a2 in the loop (a2 is used out of the loop)
3687 : 3. no uses of a1 in the loop besides the reduction operation
3688 : 4. no uses of a1 outside the loop.
3689 :
3690 : Conditions 1,4 are tested here.
3691 : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3692 :
3693 : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3694 : nested cycles.
3695 :
3696 : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3697 : reductions:
3698 :
3699 : a1 = phi < a0, a2 >
3700 : inner loop (def of a3)
3701 : a2 = phi < a3 >
3702 :
3703 : (4) Detect condition expressions, ie:
3704 : for (int i = 0; i < N; i++)
3705 : if (a[i] < val)
3706 : ret_val = a[i];
3707 : Return the stmt_vec_info of the latch definition of the PHI in PHI_INFO if one of the above was detected, NULL otherwise. For a double reduction *DOUBLE_REDUC is set to the inner loop PHI.
3708 : */
3709 :
3710 : static stmt_vec_info
3711 164479 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3712 : gphi **double_reduc)
3713 : {
3714 164479 : gphi *phi = as_a <gphi *> (phi_info->stmt);
3715 164479 : gimple *phi_use_stmt = NULL;
3716 164479 : imm_use_iterator imm_iter;
3717 164479 : use_operand_p use_p;
3718 :
3719 : /* When double_reduc is NULL we are testing the inner loop of a
3720 : double reduction. */
3721 164479 : bool inner_loop_of_double_reduc = double_reduc == NULL;
3722 164479 : if (double_reduc)
3723 163377 : *double_reduc = NULL;
3724 164479 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3725 :
3726 164479 : tree phi_name = PHI_RESULT (phi);
3727 : /* ??? If there are no uses of the PHI result the inner loop reduction
3728 : won't be detected as possibly double-reduction by vectorizable_reduction
3729 : because that tries to walk the PHI arg from the preheader edge which
3730 : can be constant. See PR60382. */
3731 164479 : if (has_zero_uses (phi_name))
3732 : return NULL;
3733 164346 : class loop *loop = (gimple_bb (phi))->loop_father;
3734 164346 : unsigned nphi_def_loop_uses = 0;
3735 621173 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3736 : {
3737 304207 : gimple *use_stmt = USE_STMT (use_p);
3738 304207 : if (is_gimple_debug (use_stmt))
3739 82564 : continue;
3740 :
3741 221643 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3742 : {
3743 11726 : if (dump_enabled_p ())
3744 40 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3745 : "intermediate value used outside loop.\n");
3746 :
3747 11726 : return NULL;
3748 : }
3749 :
3750 : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
3751 : op1 twice (once as definition, once as else) in the same operation.
3752 : Only count it as one. */
3753 209917 : if (use_stmt != phi_use_stmt)
3754 : {
3755 203375 : nphi_def_loop_uses++;
3756 203375 : phi_use_stmt = use_stmt;
3757 : }
3758 11726 : }
3759 :
3760 152620 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3761 152620 : if (TREE_CODE (latch_def) != SSA_NAME)
3762 : {
3763 1451 : if (dump_enabled_p ())
3764 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3765 : "reduction: not ssa_name: %T\n", latch_def);
3766 1451 : return NULL;
3767 : }
3768 :
3769 151169 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3770 151169 : if (!def_stmt_info
3771 151169 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3772 161 : return NULL;
3773 :
3774 151008 : bool nested_in_vect_loop
3775 151008 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3776 151008 : unsigned nlatch_def_loop_uses = 0;
3777 151008 : auto_vec<gphi *, 3> lcphis;
3778 742437 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3779 : {
3780 440421 : gimple *use_stmt = USE_STMT (use_p);
3781 440421 : if (is_gimple_debug (use_stmt))
3782 135626 : continue;
3783 304795 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3784 190264 : nlatch_def_loop_uses++;
3785 : else
3786 : /* We can have more than one loop-closed PHI. */
3787 114531 : lcphis.safe_push (as_a <gphi *> (use_stmt));
3788 151008 : }
3789 :
3790 : /* If we are vectorizing an inner reduction we are executing that
3791 : in the original order only in case we are not dealing with a
3792 : double reduction. */
3793 151008 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3794 : {
3795 2431 : if (dump_enabled_p ())
3796 434 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3797 : "detected nested cycle: ");
3798 2431 : return def_stmt_info;
3799 : }
3800 :
3801 : /* When the inner loop of a double reduction ends up with more than
3802 : one loop-closed PHI we have failed to classify alternate such
3803 : PHIs as double reduction, leading to wrong code. See PR103237. */
3804 149667 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
3805 : {
3806 1 : if (dump_enabled_p ())
3807 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3808 : "unhandle double reduction\n");
3809 1 : return NULL;
3810 : }
3811 :
3812 : /* If this isn't a nested cycle or if the nested cycle reduction value
3813 : is used outside of the inner loop we cannot handle uses of the reduction
3814 : value. */
3815 148576 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3816 : {
3817 45592 : if (dump_enabled_p ())
3818 401 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3819 : "reduction used in loop.\n");
3820 45592 : return NULL;
3821 : }
3822 :
3823 : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3824 : defined in the inner loop. */
3825 102984 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3826 : {
3827 1357 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
3828 1357 : if (gimple_phi_num_args (def_stmt) != 1
3829 1357 : || TREE_CODE (op1) != SSA_NAME)
3830 : {
3831 91 : if (dump_enabled_p ())
3832 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3833 : "unsupported phi node definition.\n");
3834 :
3835 91 : return NULL;
3836 : }
3837 :
3838 : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3839 : and the latch definition op1. */
3840 1266 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
3841 1266 : if (gimple_bb (def1)
3842 1266 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3843 1266 : && loop->inner
3844 1212 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3845 1212 : && (is_gimple_assign (def1) || is_gimple_call (def1))
3846 1203 : && is_a <gphi *> (phi_use_stmt)
3847 1191 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
3848 1191 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
3849 : loop_latch_edge (loop->inner)))
3850 2455 : && lcphis.length () == 1)
3851 : {
3852 1102 : if (dump_enabled_p ())
3853 144 : report_vect_op (MSG_NOTE, def_stmt,
3854 : "detected double reduction: ");
3855 :
3856 1102 : *double_reduc = as_a <gphi *> (phi_use_stmt);
3857 1102 : return def_stmt_info;
3858 : }
3859 :
3860 164 : return NULL;
3861 : }
3862 :
3863 : /* Look for the expression computing latch_def from the loop PHI result. */
3864 101627 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3865 101627 : code_helper code;
3866 101627 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3867 : path, inner_loop_of_double_reduc))
3868 : {
3869 94539 : STMT_VINFO_REDUC_CODE (phi_info) = code;
3870 94539 : if (code == COND_EXPR && !nested_in_vect_loop)
3871 8222 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3872 :
3873 : /* Fill in STMT_VINFO_REDUC_IDX. */
3874 94539 : unsigned i;
3875 304450 : for (i = path.length () - 1; i >= 1; --i)
3876 : {
3877 115372 : gimple *stmt = USE_STMT (path[i].second);
3878 115372 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3879 115372 : gimple_match_op op;
3880 115372 : if (!gimple_extract_op (stmt, &op))
3881 0 : gcc_unreachable ();
3882 115372 : if (gassign *assign = dyn_cast<gassign *> (stmt))
3883 109156 : STMT_VINFO_REDUC_IDX (stmt_info)
3884 109156 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3885 : else
3886 : {
3887 6216 : gcall *call = as_a<gcall *> (stmt);
3888 6216 : STMT_VINFO_REDUC_IDX (stmt_info)
3889 6216 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
3890 : }
3891 : }
3892 94539 : if (dump_enabled_p ())
3893 4080 : dump_printf_loc (MSG_NOTE, vect_location,
3894 : "reduction: detected reduction\n");
3895 :
3896 94539 : return def_stmt_info;
3897 : }
3898 :
3899 7088 : if (dump_enabled_p ())
3900 86 : dump_printf_loc (MSG_NOTE, vect_location,
3901 : "reduction: unknown pattern\n");
3902 :
3903 : return NULL;
3904 252635 : }
3905 :
3906 : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3907 : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3908 : or -1 if not known. If that or the loop iteration count is not known, the estimate is half of vect_vf_for_cost (LOOP_VINFO). */
3909 :
3910 : static int
3911 482410 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3912 : {
3913 482410 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3914 482410 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3915 : {
3916 205002 : if (dump_enabled_p ())
3917 3585 : dump_printf_loc (MSG_NOTE, vect_location,
3918 : "cost model: epilogue peel iters set to vf/2 "
3919 : "because loop iterations are unknown .\n");
3920 205002 : return assumed_vf / 2;
3921 : }
3922 : else
3923 : {
3924 277408 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3925 277408 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
3926 277408 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3927 : /* If we need to peel for gaps, but no peeling is required, we have to
3928 : peel VF iterations. */
3929 277408 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3930 277408 : peel_iters_epilogue = assumed_vf;
3931 277408 : return peel_iters_epilogue;
3932 : }
3933 : }
3934 :
3935 : /* Calculate cost of peeling the scalar loop PEEL_ITERS_PROLOGUE times for
3936 : a prologue and the corresponding times for the epilogue (as estimated by vect_get_peel_iters_epilogue), and return it. */
3937 : int
3938 357593 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue)
3939 : {
3940 357593 : int retval = 0;
3941 :
3942 357593 : int peel_iters_epilogue
3943 357593 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3944 :
3945 357593 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3946 : {
3947 : /* If peeled iterations are known but number of scalar loop
3948 : iterations are unknown, count a taken branch per peeled loop. */
3949 138759 : if (peel_iters_prologue > 0)
3950 84651 : retval = builtin_vectorization_cost (cond_branch_taken, NULL_TREE, 0);
3951 138759 : if (peel_iters_epilogue > 0)
3952 138651 : retval += builtin_vectorization_cost (cond_branch_taken, NULL_TREE, 0);
3953 : }
3954 :
3955 715186 : retval += ((peel_iters_prologue + peel_iters_epilogue)
3956 357593 : * loop_vinfo->scalar_costs->body_cost ());
3957 715186 : retval += (((peel_iters_prologue != 0) + (peel_iters_epilogue != 0))
3958 357593 : * loop_vinfo->scalar_costs->outside_cost ());
3959 :
3960 357593 : return retval;
3961 : }
3962 :
3963 : /* Function vect_estimate_min_profitable_iters
3964 :
3965 : Return the number of iterations required for the vector version of the
3966 : loop to be profitable relative to the cost of the scalar version of the
3967 : loop.
3968 :
3969 : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3970 : of iterations for vectorization. -1 value means loop vectorization
3971 : is not profitable. This returned value may be used for dynamic
3972 : profitability check.
3973 :
3974 : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3975 : for static check against estimated number of iterations. */
3976 :
3977 : static void
3978 142026 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3979 : int *ret_min_profitable_niters,
3980 : int *ret_min_profitable_estimate,
3981 : unsigned *suggested_unroll_factor)
3982 : {
3983 142026 : int min_profitable_iters;
3984 142026 : int min_profitable_estimate;
3985 142026 : int peel_iters_prologue;
3986 142026 : int peel_iters_epilogue;
3987 142026 : unsigned vec_inside_cost = 0;
3988 142026 : int vec_outside_cost = 0;
3989 142026 : unsigned vec_prologue_cost = 0;
3990 142026 : unsigned vec_epilogue_cost = 0;
3991 142026 : int scalar_single_iter_cost = 0;
3992 142026 : int scalar_outside_cost = 0;
3993 142026 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3994 142026 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3995 142026 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
3996 :
3997 : /* Cost model disabled. */
3998 142026 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3999 : {
4000 16904 : if (dump_enabled_p ())
4001 10630 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4002 16904 : *ret_min_profitable_niters = 0;
4003 16904 : *ret_min_profitable_estimate = 0;
4004 16904 : return;
4005 : }
4006 :
4007 : /* Requires loop versioning tests to handle misalignment. */
4008 125122 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4009 : {
4010 : /* FIXME: Make cost depend on complexity of individual check. */
4011 18 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4012 18 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4013 18 : if (dump_enabled_p ())
4014 2 : dump_printf (MSG_NOTE,
4015 : "cost model: Adding cost of checks for loop "
4016 : "versioning to treat misalignment.\n");
4017 : }
4018 :
4019 : /* Requires loop versioning with alias checks. */
4020 125122 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4021 : {
4022 : /* FIXME: Make cost depend on complexity of individual check. */
4023 7065 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4024 7065 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4025 7065 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4026 4 : if (len)
4027 : /* Count LEN - 1 ANDs and LEN comparisons. */
4028 4 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4029 : scalar_stmt, vect_prologue);
4030 7065 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4031 1220 : if (len)
4032 : {
4033 : /* Count LEN - 1 ANDs and LEN comparisons. */
4034 1220 : unsigned int nstmts = len * 2 - 1;
4035 : /* +1 for each bias that needs adding. */
4036 2440 : for (unsigned int i = 0; i < len; ++i)
4037 1220 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4038 143 : nstmts += 1;
4039 1220 : (void) add_stmt_cost (target_cost_data, nstmts,
4040 : scalar_stmt, vect_prologue);
4041 : }
4042 7065 : if (dump_enabled_p ())
4043 32 : dump_printf (MSG_NOTE,
4044 : "cost model: Adding cost of checks for loop "
4045 : "versioning aliasing.\n");
4046 : }
4047 :
4048 : /* Requires loop versioning with niter checks. */
4049 125122 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4050 : {
4051 : /* FIXME: Make cost depend on complexity of individual check. */
4052 751 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4053 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4054 751 : if (dump_enabled_p ())
4055 1 : dump_printf (MSG_NOTE,
4056 : "cost model: Adding cost of checks for loop "
4057 : "versioning niters.\n");
4058 : }
4059 :
4060 125122 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4061 7828 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4062 : vect_prologue);
4063 :
4064 : /* Count statements in scalar loop. Using this as scalar cost for a single
4065 : iteration for now.
4066 :
4067 : TODO: Add outer loop support.
4068 :
4069 : TODO: Consider assigning different costs to different scalar
4070 : statements. */
4071 :
4072 125122 : scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4073 :
4074 : /* Add additional cost for the peeled instructions in prologue and epilogue
4075 : loop. (For fully-masked loops there will be no peeling.)
4076 :
4077 : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4078 : at compile-time - we assume it's vf/2 (the worst would be vf-1).
4079 :
4080 : TODO: Build an expression that represents peel_iters for prologue and
4081 : epilogue to be used in a run-time test. */
4082 :
4083 125122 : bool prologue_need_br_taken_cost = false;
4084 125122 : bool prologue_need_br_not_taken_cost = false;
4085 :
4086 : /* Calculate peel_iters_prologue. */
4087 125122 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4088 : peel_iters_prologue = 0;
4089 125122 : else if (npeel < 0)
4090 : {
4091 279 : peel_iters_prologue = assumed_vf / 2;
4092 279 : if (dump_enabled_p ())
4093 8 : dump_printf (MSG_NOTE, "cost model: "
4094 : "prologue peel iters set to vf/2.\n");
4095 :
4096 : /* If peeled iterations are unknown, count a taken branch and a not taken
4097 : branch per peeled loop. Even if scalar loop iterations are known,
4098 : vector iterations are not known since peeled prologue iterations are
4099 : not known. Hence guards remain the same. */
4100 : prologue_need_br_taken_cost = true;
4101 : prologue_need_br_not_taken_cost = true;
4102 : }
4103 : else
4104 : {
4105 124843 : peel_iters_prologue = npeel;
4106 124843 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4107 : /* If peeled iterations are known but number of scalar loop
4108 : iterations are unknown, count a taken branch per peeled loop. */
4109 125122 : prologue_need_br_taken_cost = true;
4110 : }
4111 :
4112 125122 : bool epilogue_need_br_taken_cost = false;
4113 125122 : bool epilogue_need_br_not_taken_cost = false;
4114 :
4115 : /* Calculate peel_iters_epilogue. */
4116 125122 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4117 : /* We need to peel exactly one iteration for gaps. */
4118 26 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4119 125096 : else if (npeel < 0)
4120 : {
4121 : /* If peeling for alignment is unknown, loop bound of main loop
4122 : becomes unknown. */
4123 279 : peel_iters_epilogue = assumed_vf / 2;
4124 279 : if (dump_enabled_p ())
4125 8 : dump_printf (MSG_NOTE, "cost model: "
4126 : "epilogue peel iters set to vf/2 because "
4127 : "peeling for alignment is unknown.\n");
4128 :
4129 : /* See the same reason above in peel_iters_prologue calculation. */
4130 : epilogue_need_br_taken_cost = true;
4131 : epilogue_need_br_not_taken_cost = true;
4132 : }
4133 : else
4134 : {
4135 124817 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4136 124817 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4137 : /* If peeled iterations are known but number of scalar loop
4138 : iterations are unknown, count a taken branch per peeled loop. */
4139 125122 : epilogue_need_br_taken_cost = true;
4140 : }
4141 :
4142 : /* The way we accumulate peeling costs into the vector prologue/epilogue
4143 : cost is a bit awkward given we cannot reuse scalar_costs which is
4144 : already computed and also because it cannot take into account any
4145 : epilogue vectorization we'll carry out in the end. */
4146 :
4147 125122 : stmt_info_for_cost *si;
4148 125122 : int j;
4149 : /* Add costs associated with peel_iters_prologue. */
4150 125122 : if (peel_iters_prologue)
4151 1068 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4152 : {
4153 775 : (void) add_stmt_cost (target_cost_data,
4154 775 : si->count * peel_iters_prologue, si->kind,
4155 : si->stmt_info, si->node, si->vectype,
4156 : si->misalign, vect_prologue);
4157 : }
4158 :
4159 : /* Add costs associated with peel_iters_epilogue. */
4160 125122 : if (peel_iters_epilogue)
4161 388756 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4162 : {
4163 311330 : (void) add_stmt_cost (target_cost_data,
4164 311330 : si->count * peel_iters_epilogue, si->kind,
4165 : si->stmt_info, si->node, si->vectype,
4166 : si->misalign, vect_epilogue);
4167 : }
4168 :
4169 : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4170 :
4171 125122 : if (prologue_need_br_taken_cost)
4172 279 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4173 : vect_prologue);
4174 :
4175 125122 : if (prologue_need_br_not_taken_cost)
4176 279 : (void) add_stmt_cost (target_cost_data, 1,
4177 : cond_branch_not_taken, vect_prologue);
4178 :
4179 125122 : if (epilogue_need_br_taken_cost)
4180 65642 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4181 : vect_epilogue);
4182 :
4183 125122 : if (epilogue_need_br_not_taken_cost)
4184 279 : (void) add_stmt_cost (target_cost_data, 1,
4185 : cond_branch_not_taken, vect_epilogue);
4186 :
4187 : /* Take care of special costs for rgroup controls of partial vectors. */
4188 26 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4189 125148 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4190 : == vect_partial_vectors_avx512))
4191 : {
4192 : /* Calculate how many masks we need to generate. */
4193 26 : unsigned int num_masks = 0;
4194 26 : bool need_saturation = false;
4195 108 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4196 30 : if (rgm.type)
4197 : {
4198 26 : unsigned nvectors = rgm.factor;
4199 26 : num_masks += nvectors;
4200 26 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4201 26 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4202 9 : need_saturation = true;
4203 : }
4204 :
4205 : /* ??? The target isn't able to identify the costs below as
4206 : producing masks so it cannot penalize cases where we'd run
4207 : out of mask registers for example. */
4208 :
4209 : /* ??? We are also failing to account for smaller vector masks
4210 : we generate by splitting larger masks in vect_get_loop_mask. */
4211 :
4212 : /* In the worst case, we need to generate each mask in the prologue
4213 : and in the loop body. We need one splat per group and one
4214 : compare per mask.
4215 :
4216 : Sometimes the prologue mask will fold to a constant,
4217 : so the actual prologue cost might be smaller. However, it's
4218 : simpler and safer to use the worst-case cost; if this ends up
4219 : being the tie-breaker between vectorizing or not, then it's
4220 : probably better not to vectorize. */
4221 26 : (void) add_stmt_cost (target_cost_data,
4222 : num_masks
4223 26 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4224 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4225 : vect_prologue);
4226 52 : (void) add_stmt_cost (target_cost_data,
4227 : num_masks
4228 52 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4229 : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4230 :
4231 : /* When we need saturation we need it both in the prologue and
4232 : the loop body. */
4233 26 : if (need_saturation)
4234 : {
4235 9 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4236 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4237 9 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4238 : NULL, NULL, NULL_TREE, 0, vect_body);
4239 : }
4240 : }
4241 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4242 125096 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4243 : == vect_partial_vectors_while_ult))
4244 : {
4245 : /* Calculate how many masks we need to generate. */
4246 : unsigned int num_masks = 0;
4247 : rgroup_controls *rgm;
4248 : unsigned int num_vectors_m1;
4249 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4250 : num_vectors_m1, rgm)
4251 0 : if (rgm->type)
4252 0 : num_masks += num_vectors_m1 + 1;
4253 0 : gcc_assert (num_masks > 0);
4254 :
4255 : /* In the worst case, we need to generate each mask in the prologue
4256 : and in the loop body. One of the loop body mask instructions
4257 : replaces the comparison in the scalar loop, and since we don't
4258 : count the scalar comparison against the scalar body, we shouldn't
4259 : count that vector instruction against the vector body either.
4260 :
4261 : Sometimes we can use unpacks instead of generating prologue
4262 : masks and sometimes the prologue mask will fold to a constant,
4263 : so the actual prologue cost might be smaller. However, it's
4264 : simpler and safer to use the worst-case cost; if this ends up
4265 : being the tie-breaker between vectorizing or not, then it's
4266 : probably better not to vectorize. */
4267 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4268 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4269 : vect_prologue);
4270 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4271 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4272 : vect_body);
4273 : }
4274 125096 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4275 : {
4276 : /* Referring to the functions vect_set_loop_condition_partial_vectors
4277 : and vect_set_loop_controls_directly, we need to generate each
4278 : length in the prologue and in the loop body if required. Although
4279 : there are some possible optimizations, we consider the worst case
4280 : here. */
4281 :
4282 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4283 0 : signed char partial_load_store_bias
4284 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4285 0 : bool need_iterate_p
4286 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4287 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4288 :
4289 : /* Calculate how many statements to be added. */
4290 0 : unsigned int prologue_stmts = 0;
4291 0 : unsigned int body_stmts = 0;
4292 :
4293 0 : rgroup_controls *rgc;
4294 0 : unsigned int num_vectors_m1;
4295 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4296 0 : if (rgc->type)
4297 : {
4298 : /* May need one SHIFT for nitems_total computation. */
4299 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4300 0 : if (nitems != 1 && !niters_known_p)
4301 0 : prologue_stmts += 1;
4302 :
4303 : /* May need one MAX and one MINUS for wrap around. */
4304 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4305 0 : prologue_stmts += 2;
4306 :
4307 : /* Need one MAX and one MINUS for each batch limit excepting for
4308 : the 1st one. */
4309 0 : prologue_stmts += num_vectors_m1 * 2;
4310 :
4311 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4312 :
4313 : /* Need to set up lengths in prologue, only one MIN required
4314 : for each since start index is zero. */
4315 0 : prologue_stmts += num_vectors;
4316 :
4317 : /* If we have a non-zero partial load bias, we need one PLUS
4318 : to adjust the load length. */
4319 0 : if (partial_load_store_bias != 0)
4320 0 : body_stmts += 1;
4321 :
4322 0 : unsigned int length_update_cost = 0;
4323 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4324 : /* For decrement IV style, Each only need a single SELECT_VL
4325 : or MIN since beginning to calculate the number of elements
4326 : need to be processed in current iteration. */
4327 : length_update_cost = 1;
4328 : else
4329 : /* For increment IV style, each may need two MINs and one MINUS to
4330 : update lengths in body for next iteration. */
4331 0 : length_update_cost = 3;
4332 :
4333 0 : if (need_iterate_p)
4334 0 : body_stmts += length_update_cost * num_vectors;
4335 : }
4336 :
4337 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4338 : scalar_stmt, vect_prologue);
4339 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4340 : scalar_stmt, vect_body);
4341 : }
4342 :
4343 : /* FORNOW: The scalar outside cost is incremented in one of the
4344 : following ways:
4345 :
4346 : 1. The vectorizer checks for alignment and aliasing and generates
4347 : a condition that allows dynamic vectorization. A cost model
4348 : check is ANDED with the versioning condition. Hence scalar code
4349 : path now has the added cost of the versioning check.
4350 :
4351 : if (cost > th & versioning_check)
4352 : jmp to vector code
4353 :
4354 : Hence run-time scalar is incremented by not-taken branch cost.
4355 :
4356 : 2. The vectorizer then checks if a prologue is required. If the
4357 : cost model check was not done before during versioning, it has to
4358 : be done before the prologue check.
4359 :
4360 : if (cost <= th)
4361 : prologue = scalar_iters
4362 : if (prologue == 0)
4363 : jmp to vector code
4364 : else
4365 : execute prologue
4366 : if (prologue == num_iters)
4367 : go to exit
4368 :
4369 : Hence the run-time scalar cost is incremented by a taken branch,
4370 : plus a not-taken branch, plus a taken branch cost.
4371 :
4372 : 3. The vectorizer then checks if an epilogue is required. If the
4373 : cost model check was not done before during prologue check, it
4374 : has to be done with the epilogue check.
4375 :
4376 : if (prologue == 0)
4377 : jmp to vector code
4378 : else
4379 : execute prologue
4380 : if (prologue == num_iters)
4381 : go to exit
4382 : vector code:
4383 : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4384 : jmp to epilogue
4385 :
4386 : Hence the run-time scalar cost should be incremented by 2 taken
4387 : branches.
4388 :
4389 : TODO: The back end may reorder the BBS's differently and reverse
4390 : conditions/branch directions. Change the estimates below to
4391 : something more reasonable. */
4392 :
4393 : /* If the number of iterations is known and we do not do versioning, we can
4394 : decide whether to vectorize at compile time. Hence the scalar version
4395 : does not carry cost model guard costs. */
4396 58622 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4397 183744 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4398 : {
4399 : /* Cost model check occurs at versioning. */
4400 67565 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4401 7828 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4402 : else
4403 : {
4404 : /* Cost model check occurs at prologue generation. */
4405 59737 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4406 152 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4407 152 : + vect_get_stmt_cost (cond_branch_not_taken);
4408 : /* Cost model check occurs at epilogue generation. */
4409 : else
4410 59585 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4411 : }
4412 : }
4413 :
4414 : /* Complete the target-specific cost calculations. */
4415 125122 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
4416 125122 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
4417 125122 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
4418 125122 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
4419 125122 : if (suggested_unroll_factor)
4420 124754 : *suggested_unroll_factor
4421 124754 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
4422 :
4423 124754 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
4424 413 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4425 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4426 : *suggested_unroll_factor,
4427 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4428 : {
4429 0 : if (dump_enabled_p ())
4430 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4431 : "can't unroll as unrolled vectorization factor larger"
4432 : " than maximum vectorization factor: "
4433 : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4434 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4435 0 : *suggested_unroll_factor = 1;
4436 : }
4437 :
4438 125122 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4439 :
4440 125122 : if (dump_enabled_p ())
4441 : {
4442 1075 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4443 1075 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4444 : vec_inside_cost);
4445 1075 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4446 : vec_prologue_cost);
4447 1075 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4448 : vec_epilogue_cost);
4449 1075 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4450 : scalar_single_iter_cost);
4451 1075 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4452 : scalar_outside_cost);
4453 1075 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4454 : vec_outside_cost);
4455 1075 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4456 : peel_iters_prologue);
4457 1075 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4458 : peel_iters_epilogue);
4459 : }
4460 :
4461 : /* Calculate number of iterations required to make the vector version
4462 : profitable, relative to the loop bodies only. The following condition
4463 : must hold true:
4464 : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4465 : where
4466 : SIC = scalar iteration cost, VIC = vector iteration cost,
4467 : VOC = vector outside cost, VF = vectorization factor,
4468 : NPEEL = prologue iterations + epilogue iterations,
4469 : SOC = scalar outside cost for run time cost model check. */
4470 :
4471 125122 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4472 125122 : - vec_inside_cost);
4473 125122 : if (saving_per_viter <= 0)
4474 : {
4475 24892 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4476 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4477 : "vectorization did not happen for a simd loop");
4478 :
4479 24892 : if (dump_enabled_p ())
4480 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4481 : "cost model: the vector iteration cost = %d "
4482 : "divided by the scalar iteration cost = %d "
4483 : "is greater or equal to the vectorization factor = %d"
4484 : ".\n",
4485 : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4486 24892 : *ret_min_profitable_niters = -1;
4487 24892 : *ret_min_profitable_estimate = -1;
4488 24892 : return;
4489 : }
4490 :
4491 : /* ??? The "if" arm is written to handle all cases; see below for what
4492 : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4493 100230 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4494 : {
4495 : /* Rewriting the condition above in terms of the number of
4496 : vector iterations (vniters) rather than the number of
4497 : scalar iterations (niters) gives:
4498 :
4499 : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4500 :
4501 : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4502 :
4503 : For integer N, X and Y when X > 0:
4504 :
4505 : N * X > Y <==> N >= (Y /[floor] X) + 1. */
4506 18 : int outside_overhead = (vec_outside_cost
4507 18 : - scalar_single_iter_cost * peel_iters_prologue
4508 18 : - scalar_single_iter_cost * peel_iters_epilogue
4509 : - scalar_outside_cost);
4510 : /* We're only interested in cases that require at least one
4511 : vector iteration. */
4512 18 : int min_vec_niters = 1;
4513 18 : if (outside_overhead > 0)
4514 13 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4515 :
4516 18 : if (dump_enabled_p ())
4517 7 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4518 : min_vec_niters);
4519 :
4520 18 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4521 : {
4522 : /* Now that we know the minimum number of vector iterations,
4523 : find the minimum niters for which the scalar cost is larger:
4524 :
4525 : SIC * niters > VIC * vniters + VOC - SOC
4526 :
4527 : We know that the minimum niters is no more than
4528 : vniters * VF + NPEEL, but it might be (and often is) less
4529 : than that if a partial vector iteration is cheaper than the
4530 : equivalent scalar code. */
4531 18 : int threshold = (vec_inside_cost * min_vec_niters
4532 18 : + vec_outside_cost
4533 18 : - scalar_outside_cost);
4534 18 : if (threshold <= 0)
4535 : min_profitable_iters = 1;
4536 : else
4537 18 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4538 : }
4539 : else
4540 : /* Convert the number of vector iterations into a number of
4541 : scalar iterations. */
4542 0 : min_profitable_iters = (min_vec_niters * assumed_vf
4543 0 : + peel_iters_prologue
4544 : + peel_iters_epilogue);
4545 : }
4546 : else
4547 : {
4548 100212 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4549 100212 : * assumed_vf
4550 100212 : - vec_inside_cost * peel_iters_prologue
4551 100212 : - vec_inside_cost * peel_iters_epilogue);
4552 100212 : if (min_profitable_iters <= 0)
4553 : min_profitable_iters = 0;
4554 : else
4555 : {
4556 85143 : min_profitable_iters /= saving_per_viter;
4557 :
4558 85143 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4559 85143 : <= (((int) vec_inside_cost * min_profitable_iters)
4560 85143 : + (((int) vec_outside_cost - scalar_outside_cost)
4561 : * assumed_vf)))
4562 85143 : min_profitable_iters++;
4563 : }
4564 : }
4565 :
4566 100230 : if (dump_enabled_p ())
4567 1041 : dump_printf (MSG_NOTE,
4568 : " Calculated minimum iters for profitability: %d\n",
4569 : min_profitable_iters);
4570 :
4571 100230 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4572 100212 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4573 : /* We want the vectorized loop to execute at least once. */
4574 : min_profitable_iters = assumed_vf + peel_iters_prologue;
4575 21975 : else if (min_profitable_iters < peel_iters_prologue)
4576 : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4577 : vectorized loop executes at least once. */
4578 : min_profitable_iters = peel_iters_prologue;
4579 :
4580 100230 : if (dump_enabled_p ())
4581 1041 : dump_printf_loc (MSG_NOTE, vect_location,
4582 : " Runtime profitability threshold = %d\n",
4583 : min_profitable_iters);
4584 :
4585 100230 : *ret_min_profitable_niters = min_profitable_iters;
4586 :
4587 : /* Calculate number of iterations required to make the vector version
4588 : profitable, relative to the loop bodies only.
4589 :
4590 : Non-vectorized variant is SIC * niters and it must win over vector
4591 : variant on the expected loop trip count. The following condition must hold true:
4592 : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4593 :
4594 100230 : if (vec_outside_cost <= 0)
4595 : min_profitable_estimate = 0;
4596 : /* ??? This "else if" arm is written to handle all cases; see below for
4597 : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4598 89665 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4599 : {
4600 : /* This is a repeat of the code above, but with + SOC rather
4601 : than - SOC. */
4602 18 : int outside_overhead = (vec_outside_cost
4603 18 : - scalar_single_iter_cost * peel_iters_prologue
4604 18 : - scalar_single_iter_cost * peel_iters_epilogue
4605 : + scalar_outside_cost);
4606 18 : int min_vec_niters = 1;
4607 18 : if (outside_overhead > 0)
4608 18 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4609 :
4610 18 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4611 : {
4612 18 : int threshold = (vec_inside_cost * min_vec_niters
4613 18 : + vec_outside_cost
4614 18 : + scalar_outside_cost);
4615 18 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4616 : }
4617 : else
4618 : min_profitable_estimate = (min_vec_niters * assumed_vf
4619 : + peel_iters_prologue
4620 : + peel_iters_epilogue);
4621 : }
4622 : else
4623 : {
4624 89647 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4625 89647 : * assumed_vf
4626 89647 : - vec_inside_cost * peel_iters_prologue
4627 89647 : - vec_inside_cost * peel_iters_epilogue)
4628 89647 : / ((scalar_single_iter_cost * assumed_vf)
4629 : - vec_inside_cost);
4630 : }
4631 100230 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4632 100230 : if (dump_enabled_p ())
4633 1041 : dump_printf_loc (MSG_NOTE, vect_location,
4634 : " Static estimate profitability threshold = %d\n",
4635 : min_profitable_estimate);
4636 :
4637 100230 : *ret_min_profitable_estimate = min_profitable_estimate;
4638 : }
4639 :
4640 : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4641 : vector elements (not bits) for a vector with NELT elements.
     :
     : new_vector is called on SEL on every invocation, after which exactly
     : three indices are pushed: OFFSET, OFFSET + 1 and OFFSET + 2. Nothing
     : is returned; the result is communicated only through SEL. */
4642 : static void
4643 2292 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4644 : vec_perm_builder *sel)
4645 : {
4646 : /* The encoding is a single stepped pattern. Any wrap-around is handled
4647 : by vec_perm_indices. Only three elements are pushed, matching the
     : "1, 3" passed to new_vector; presumably the builder extrapolates the
     : remaining indices of the NELT-element selector from the step
     : (NOTE(review): confirm against the vec_perm_builder encoding). */
4648 2292 : sel->new_vector (nelt, 1, 3);
4649 9168 : for (unsigned int i = 0; i < 3; i++)
4650 6876 : sel->quick_push (i + offset);
4651 2292 : }
4652 :
4653 : /* Checks whether the target supports whole-vector shifts for vectors of mode
4654 : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4655 : it supports vec_perm_const with masks for all necessary shift amounts.
     :
     : The shift amounts probed in the permute fallback are NELT/2, NELT/4,
     : ... down to 1 element, where NELT is the constant number of units of
     : MODE. Returns false as soon as one of these permutes is rejected by
     : can_vec_perm_const_p, and also when MODE has no constant number of
     : units and the optab is not available. */
4656 : static bool
4657 13628 : have_whole_vector_shift (machine_mode mode)
4658 : {
4659 13628 : if (can_implement_p (vec_shr_optab, mode))
4660 : return true;
4661 :
4662 : /* Variable-length vectors should be handled via the optab. If the
     : number of units is not a compile-time constant there is no fixed set
     : of shift amounts to probe below, so give up. */
4663 63 : unsigned int nelt;
4664 126 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4665 : return false;
4666 :
4667 63 : vec_perm_builder sel;
4668 63 : vec_perm_indices indices;
4669 315 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4670 : {
4671 252 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
4672 252 : indices.new_vector (sel, 2, nelt);
4673 252 : if (!can_vec_perm_const_p (mode, mode, indices, false))
4674 : return false;
4675 : }
4676 : return true;
4677 63 : }
4678 :
4679 : /* Return true if (a) the representative statement of SLP_NODE is a
4680 : DOT_PROD_EXPR assignment whose multiplication operands (rhs1 and rhs2)
4681 : have differing signs and (b) we intend to emulate the operation using
4682 : a series of signed DOT_PROD_EXPRs, i.e. directly_supported_p reports
     : that the mixed-sign form (optab_vector_mixed_sign) is not available
     : for the vector type of SLP_NODE together with that of its first child.
     : Return false if the statement is not an assignment, is not a
     : DOT_PROD_EXPR, or if both operands have the same sign.
     : See vect_emulate_mixed_dot_prod for the actual sequence used. */
4683 :
4684 : static bool
4685 2456 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
4686 : {
4687 2456 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
4688 2456 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4689 2003 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4690 : return false;
4691 :
4692 821 : tree rhs1 = gimple_assign_rhs1 (assign);
4693 821 : tree rhs2 = gimple_assign_rhs2 (assign);
4694 821 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4695 : return false;
4696 :
4697 627 : return !directly_supported_p (DOT_PROD_EXPR,
4698 : SLP_TREE_VECTYPE (slp_node),
4699 209 : SLP_TREE_VECTYPE
4700 : (SLP_TREE_CHILDREN (slp_node)[0]),
4701 209 : optab_vector_mixed_sign);
4702 : }
4703 :
4704 : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4705 : functions. Design better to avoid maintenance issues. */
4706 :
4707 : /* Function vect_model_reduction_cost.
4708 :
4709 : Models cost for a reduction operation, including the vector ops
4710 : generated within the strip-mine loop in some cases, the initial
4711 : definition before the loop, and the epilogue code that must be generated. */
4712 :
4713 : static void
4714 71981 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
4715 : slp_tree node, internal_fn reduc_fn,
4716 : vect_reduction_type reduction_type,
4717 : int ncopies, stmt_vector_for_cost *cost_vec)
4718 : {
4719 71981 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4720 71981 : tree vectype;
4721 71981 : machine_mode mode;
4722 71981 : class loop *loop = NULL;
4723 :
4724 71981 : if (loop_vinfo)
4725 71981 : loop = LOOP_VINFO_LOOP (loop_vinfo);
4726 :
4727 : /* Condition reductions generate two reductions in the loop. */
4728 71981 : if (reduction_type == COND_REDUCTION)
4729 304 : ncopies *= 2;
4730 :
4731 71981 : vectype = SLP_TREE_VECTYPE (node);
4732 71981 : mode = TYPE_MODE (vectype);
4733 71981 : stmt_vec_info orig_stmt_info
4734 71981 : = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
4735 :
4736 71981 : gimple_match_op op;
4737 71981 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4738 0 : gcc_unreachable ();
4739 :
4740 71981 : if (reduction_type == EXTRACT_LAST_REDUCTION)
4741 : /* No extra instructions are needed in the prologue. The loop body
4742 : operations are costed in vectorizable_condition. */
4743 : inside_cost = 0;
4744 71981 : else if (reduction_type == FOLD_LEFT_REDUCTION)
4745 : {
4746 : /* No extra instructions needed in the prologue. */
4747 4306 : prologue_cost = 0;
4748 :
4749 4306 : if (reduc_fn != IFN_LAST)
4750 : /* Count one reduction-like operation per vector. */
4751 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4752 : node, 0, vect_body);
4753 : else
4754 : {
4755 : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4756 4306 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4757 4306 : inside_cost = record_stmt_cost (cost_vec, nelements,
4758 : vec_to_scalar, node, 0,
4759 : vect_body);
4760 4306 : inside_cost += record_stmt_cost (cost_vec, nelements,
4761 : scalar_stmt, node, 0,
4762 : vect_body);
4763 : }
4764 : }
4765 : else
4766 : {
4767 : /* Add in the cost of the initial definitions. */
4768 67675 : int prologue_stmts;
4769 67675 : if (reduction_type == COND_REDUCTION)
4770 : /* For cond reductions we have four vectors: initial index, step,
4771 : initial result of the data reduction, initial value of the index
4772 : reduction. */
4773 : prologue_stmts = 4;
4774 : else
4775 : /* We need the initial reduction value. */
4776 67371 : prologue_stmts = 1;
4777 67675 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4778 : scalar_to_vec, node, 0,
4779 : vect_prologue);
4780 : }
4781 :
4782 : /* Determine cost of epilogue code.
4783 :
4784 : We have a reduction operator that will reduce the vector in one statement.
4785 : Also requires scalar extract. */
4786 :
4787 71981 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4788 : {
4789 71797 : if (reduc_fn != IFN_LAST)
4790 : {
4791 52315 : if (reduction_type == COND_REDUCTION)
4792 : {
4793 : /* An EQ stmt and a COND_EXPR stmt. */
4794 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4795 : vector_stmt, node, 0,
4796 : vect_epilogue);
4797 : /* Reduction of the max index and a reduction of the found
4798 : values. */
4799 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4800 : vec_to_scalar, node, 0,
4801 : vect_epilogue);
4802 : /* A broadcast of the max value. */
4803 8 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4804 : scalar_to_vec, node, 0,
4805 : vect_epilogue);
4806 : }
4807 : else
4808 : {
4809 52307 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4810 : node, 0, vect_epilogue);
4811 52307 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4812 : vec_to_scalar, node, 0,
4813 : vect_epilogue);
4814 : }
4815 : }
4816 19482 : else if (reduction_type == COND_REDUCTION)
4817 : {
4818 296 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4819 : /* Extraction of scalar elements. */
4820 592 : epilogue_cost += record_stmt_cost (cost_vec,
4821 296 : 2 * estimated_nunits,
4822 : vec_to_scalar, node, 0,
4823 : vect_epilogue);
4824 : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4825 296 : epilogue_cost += record_stmt_cost (cost_vec,
4826 296 : 2 * estimated_nunits - 3,
4827 : scalar_stmt, node, 0,
4828 : vect_epilogue);
4829 : }
4830 19186 : else if (reduction_type == EXTRACT_LAST_REDUCTION
4831 19186 : || reduction_type == FOLD_LEFT_REDUCTION)
4832 : /* No extra instructions are needed in the epilogue. */
4833 : ;
4834 : else
4835 : {
4836 14880 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4837 14880 : tree bitsize = TYPE_SIZE (op.type);
4838 14880 : int element_bitsize = tree_to_uhwi (bitsize);
4839 14880 : int nelements = vec_size_in_bits / element_bitsize;
4840 :
4841 14880 : if (op.code == COND_EXPR)
4842 31 : op.code = MAX_EXPR;
4843 :
4844 : /* We have a whole vector shift available. */
4845 3092 : if (VECTOR_MODE_P (mode)
4846 14880 : && directly_supported_p (op.code, vectype)
4847 26538 : && have_whole_vector_shift (mode))
4848 : {
4849 : /* Final reduction via vector shifts and the reduction operator.
4850 : Also requires scalar extract. */
4851 34974 : epilogue_cost += record_stmt_cost (cost_vec,
4852 23316 : exact_log2 (nelements) * 2,
4853 : vector_stmt, node, 0,
4854 : vect_epilogue);
4855 11658 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4856 : vec_to_scalar, node, 0,
4857 : vect_epilogue);
4858 : }
4859 : else
4860 : /* Use extracts and reduction op for final reduction. For N
4861 : elements, we have N extracts and N-1 reduction ops. */
4862 3222 : epilogue_cost += record_stmt_cost (cost_vec,
4863 3222 : nelements + nelements - 1,
4864 : vector_stmt, node, 0,
4865 : vect_epilogue);
4866 : }
4867 : }
4868 :
4869 71981 : if (dump_enabled_p ())
4870 2985 : dump_printf (MSG_NOTE,
4871 : "vect_model_reduction_cost: inside_cost = %d, "
4872 : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4873 : prologue_cost, epilogue_cost);
4874 71981 : }
4875 :
4876 : /* SEQ is a sequence of instructions that initialize the reduction
4877 : described by REDUC_INFO. Emit them in the appropriate place. */
/* LOOP_VINFO supplies either the edge that skips the main loop (used when
   REDUC_INFO reuses an accumulator) or the loop whose preheader edge
   receives SEQ (used otherwise). Nothing is returned; the only effect is
   the insertion of SEQ. */
4878 :
4879 : static void
4880 442 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4881 : vect_reduc_info reduc_info, gimple *seq)
4882 : {
4883 442 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
4884 : {
4885 : /* When reusing an accumulator from the main loop, we only need
4886 : initialization instructions if the main loop can be skipped.
4887 : In that case, emit the initialization instructions at the end
4888 : of the guard block that does the skip. */
4889 22 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
/* Reaching this point without a skip edge is treated as a bug. */
4890 22 : gcc_assert (skip_edge);
/* SEQ is placed before the statement the iterator designates, i.e. the
   last statement of the skip edge's source block (presumably the guard's
   controlling jump -- confirm). */
4891 22 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4892 22 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4893 : }
4894 : else
4895 : {
4896 : /* The normal case: emit the initialization instructions on the
4897 : preheader edge. */
4898 420 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4899 420 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4900 : }
4901 442 : }
4902 :
4903 : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4904 : which performs a reduction involving GROUP_SIZE scalar statements.
4905 : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4906 : is nonnull, introducing extra elements of that value will not change the
4907 : result. */
/* VECTOR_TYPE is the type of every vector def that is built. The defs are
   appended to *VEC_OPRNDS with quick_push, so the caller is presumably
   expected to have reserved room for them (confirm against callers).
   Scalar initial values recorded in REDUC_INFO that need converting to the
   element type of VECTOR_TYPE are converted and stored back in REDUC_INFO.
   Any statements generated along the way are emitted through
   vect_emit_reduction_init_stmts. */
4908 :
4909 : static void
4910 21604 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4911 : vect_reduc_info reduc_info,
4912 : tree vector_type,
4913 : vec<tree> *vec_oprnds,
4914 : unsigned int number_of_vectors,
4915 : unsigned int group_size, tree neutral_op)
4916 : {
4917 21604 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
4918 21604 : unsigned HOST_WIDE_INT nunits;
4919 21604 : unsigned j, number_of_places_left_in_vector;
4920 21604 : unsigned int i;
4921 :
/* Without a neutral value there must be exactly one initial value per
   group member. */
4922 43208 : gcc_assert (group_size == initial_values.length () || neutral_op);
4923 :
4924 : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4925 : created vectors. It is greater than 1 if unrolling is performed.
4926 :
4927 : For example, we have two scalar operands, s1 and s2 (e.g., group of
4928 : strided accesses of size two), while NUNITS is four (i.e., four scalars
4929 : of this type can be packed in a vector). The output vector will contain
4930 : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4931 : will be 2).
4932 :
4933 : If GROUP_SIZE > NUNITS, the scalars will be split into several
4934 : vectors containing the operands.
4935 :
4936 : For example, NUNITS is four as before, and the group size is 8
4937 : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4938 : {s5, s6, s7, s8}. */
4939 :
/* When the number of vector elements is not a compile-time constant, only
   GROUP_SIZE elements are collected at a time; the code further down then
   decides how to expand them into a full vector. */
4940 21604 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4941 : nunits = group_size;
4942 :
4943 21604 : tree vector_elt_type = TREE_TYPE (vector_type);
4944 21604 : number_of_places_left_in_vector = nunits;
4945 21604 : bool constant_p = true;
4946 21604 : tree_vector_builder elts (vector_type, nunits, 1);
4947 21604 : elts.quick_grow (nunits);
4948 21604 : gimple_seq ctor_seq = NULL;
/* Bring NEUTRAL_OP to the vector element type if it is not already
   compatible. For boolean vectors this is a COND_EXPR choosing between
   all-ones and zero; otherwise it is a plain conversion. */
4949 21604 : if (neutral_op
4950 42642 : && !useless_type_conversion_p (vector_elt_type,
4951 21038 : TREE_TYPE (neutral_op)))
4952 : {
4953 222 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4954 201 : neutral_op = gimple_build (&ctor_seq, COND_EXPR,
4955 : vector_elt_type,
4956 : neutral_op,
4957 : build_all_ones_cst (vector_elt_type),
4958 : build_zero_cst (vector_elt_type));
4959 : else
4960 21 : neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
4961 : }
/* Produce one vector element per iteration, NUNITS elements for each of
   the NUMBER_OF_VECTORS vectors. */
4962 202576 : for (j = 0; j < nunits * number_of_vectors; ++j)
4963 : {
4964 180972 : tree op;
/* I is the position within the group that element J corresponds to. */
4965 180972 : i = j % group_size;
4966 :
4967 : /* Get the def before the loop. In reduction chain we have only
4968 : one initial value. Else we have as many as PHIs in the group. */
/* The neutral value is used when position I has no initial value of its
   own, and also, if a neutral value exists, for every element after the
   first pass over the group (J > I only once J has reached GROUP_SIZE). */
4969 180972 : if (i >= initial_values.length () || (j > i && neutral_op))
4970 : op = neutral_op;
4971 : else
4972 : {
/* Same type adjustment as for NEUTRAL_OP above. The adjusted value is
   written back to INITIAL_VALUES, which refers to the vector held by
   REDUC_INFO. */
4973 50734 : if (!useless_type_conversion_p (vector_elt_type,
4974 25367 : TREE_TYPE (initial_values[i])))
4975 : {
4976 237 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4977 426 : initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
4978 : vector_elt_type,
4979 213 : initial_values[i],
4980 : build_all_ones_cst
4981 : (vector_elt_type),
4982 : build_zero_cst
4983 : (vector_elt_type));
4984 : else
4985 48 : initial_values[i] = gimple_convert (&ctor_seq,
4986 : vector_elt_type,
4987 24 : initial_values[i]);
4988 : }
4989 25367 : op = initial_values[i];
4990 : }
4991 :
4992 : /* Create 'vect_ = {op0,op1,...,opn}'. */
4993 180972 : number_of_places_left_in_vector--;
4994 180972 : elts[nunits - number_of_places_left_in_vector - 1] = op;
4995 180972 : if (!CONSTANT_CLASS_P (op))
4996 2337 : constant_p = false;
4997 :
/* ELTS is full: turn it into a vector def. */
4998 180972 : if (number_of_places_left_in_vector == 0)
4999 : {
5000 23068 : tree init;
/* With all-constant elements and no neutral value it is enough for the
   vector length to be a multiple of NUNITS (presumably because the
   builder then describes a repeating pattern -- confirm); in every other
   case ELTS has to cover the whole vector exactly. */
5001 46136 : if (constant_p && !neutral_op
5002 45853 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5003 23068 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5004 : /* Build the vector directly from ELTS. */
5005 23068 : init = gimple_build_vector (&ctor_seq, &elts);
5006 0 : else if (neutral_op)
5007 : {
5008 : /* Build a vector of the neutral value and shift the
5009 : other elements into place. */
5010 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5011 : neutral_op);
5012 0 : int k = nunits;
/* Trailing elements equal to the neutral value are already in place, so
   skip them; the rest are inserted from last to first. */
5013 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5014 : k -= 1;
5015 0 : while (k > 0)
5016 : {
5017 0 : k -= 1;
5018 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5019 0 : vector_type, init, elts[k]);
5020 : }
5021 : }
5022 : else
5023 : {
5024 : /* First time round, duplicate ELTS to fill the
5025 : required number of vectors. */
/* duplicate_and_interleave is handed *VEC_OPRNDS and NUMBER_OF_VECTORS
   itself, so the loop stops here without pushing anything more. */
5026 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5027 : elts, number_of_vectors, *vec_oprnds);
5028 0 : break;
5029 : }
5030 23068 : vec_oprnds->quick_push (init);
5031 :
/* Reset the builder state for the next vector. */
5032 23068 : number_of_places_left_in_vector = nunits;
5033 23068 : elts.new_vector (vector_type, nunits, 1);
5034 23068 : elts.quick_grow (nunits);
5035 23068 : constant_p = true;
5036 : }
5037 : }
/* Emit whatever statements were needed to build the defs. */
5038 21604 : if (ctor_seq != NULL)
5039 442 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5040 21604 : }
5041 :
/* Return the entry of LOOP_VINFO's reduc_infos array selected by the cycle
   id recorded in NODE, or NULL if that id is -1. */
5042 : vect_reduc_info
5043 160868 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
5044 : {
5045 160868 : if (node->cycle_info.id == -1)
5046 : return NULL;
5047 158900 : return loop_vinfo->reduc_infos[node->cycle_info.id];
5048 : }
5049 :
5050 : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5051 : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5052 : return false. */
/* VECTYPE is the vector type the main loop's accumulator would have to be
   reduced to. On success the epilogue adjustment, the initial values and
   the reused-accumulator field of REDUC_INFO are overwritten; on failure
   REDUC_INFO is left untouched. */
5053 :
5054 : static bool
5055 21243 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5056 : vect_reduc_info reduc_info, tree vectype)
5057 : {
/* No recorded original loop info means there is no main loop to reuse
   anything from. */
5058 21243 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5059 21243 : if (!main_loop_vinfo)
5060 : return false;
5061 :
/* Only TREE_CODE_REDUCTION reductions are considered. */
5062 4576 : if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5063 : return false;
5064 :
5065 : /* We are not set up to handle vector bools when they are not mapped
5066 : to vector integer data types. */
5067 4561 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5068 4633 : && GET_MODE_CLASS (TYPE_MODE (vectype)) != MODE_VECTOR_INT)
5069 : return false;
5070 :
/* MAIN_LOOP_RESULTS collects, per reduction PHI, the value coming out of
   the main loop. INITIAL_VALUES collects the value used when the main
   loop is skipped; it is only filled in when there is a main loop edge. */
5071 4559 : unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
5072 4559 : auto_vec<tree, 16> main_loop_results (num_phis);
5073 4559 : auto_vec<tree, 16> initial_values (num_phis);
5074 4559 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5075 : {
5076 : /* The epilogue loop can be entered either from the main loop or
5077 : from an earlier guard block. */
5078 4336 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5079 17368 : for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
5080 : {
5081 : /* Look for:
5082 :
5083 : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5084 : INITIAL_VALUE(guard block)>. */
5085 4360 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5086 :
/* The defining PHI is required to sit in the destination block of the
   main loop edge. */
5087 4360 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5088 4360 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5089 :
5090 4360 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5091 4360 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5092 :
5093 4360 : main_loop_results.quick_push (from_main_loop);
5094 4360 : initial_values.quick_push (from_skip);
5095 : }
5096 : }
5097 : else
5098 : /* The main loop dominates the epilogue loop. */
/* The recorded initial values are taken as the main loop results as they
   stand; INITIAL_VALUES stays empty on this path. */
5099 223 : main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));
5100 :
5101 : /* See if the main loop has the kind of accumulator we need. */
/* The lookup is keyed on the first main loop result only; the accumulator
   found must then match every main loop result, in order and in number. */
5102 4559 : vect_reusable_accumulator *accumulator
5103 4559 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5104 4559 : if (!accumulator
5105 9102 : || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
5106 13657 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5107 : VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
5108 : return false;
5109 :
5110 : /* Handle the case where we can reduce wider vectors to narrower ones. */
/* The old element count must be a constant multiple of the new one. The
   multiple M is required by the interface but not used again here. */
5111 4549 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5112 4549 : unsigned HOST_WIDE_INT m;
5113 4549 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5114 4549 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5115 0 : return false;
5116 : /* Check the intermediate vector types and operations are available. */
/* Halve the element count step by step until it reaches that of VECTYPE.
   Each step needs a vector type of the halved size, direct support for
   the reduction code on that type, and the ability to extract such a
   vector from the previous, wider one. */
5117 4549 : tree prev_vectype = old_vectype;
5118 4549 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5119 13273 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5120 : {
5121 4699 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5122 4699 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5123 4699 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5124 4699 : if (!intermediate_vectype
5125 4699 : || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
5126 : intermediate_vectype)
5127 8878 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5128 4179 : TYPE_MODE (intermediate_vectype)))
5129 : return false;
5130 : prev_vectype = intermediate_vectype;
5131 : }
5132 :
5133 : /* Non-SLP reductions might apply an adjustment after the reduction
5134 : operation, in order to simplify the initialization of the accumulator.
5135 : If the epilogue loop carries on from where the main loop left off,
5136 : it should apply the same adjustment to the final reduction result.
5137 :
5138 : If the epilogue loop can also be entered directly (rather than via
5139 : the main loop), we need to be able to handle that case in the same way,
5140 : with the same adjustment. (In principle we could add a PHI node
5141 : to select the correct adjustment, but in practice that shouldn't be
5142 : necessary.) */
5143 4025 : tree main_adjustment
5144 4025 : = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5145 4025 : if (loop_vinfo->main_loop_edge && main_adjustment)
5146 : {
5147 3435 : gcc_assert (num_phis == 1);
5148 3435 : tree initial_value = initial_values[0];
5149 : /* Check that we can use INITIAL_VALUE as the adjustment and
5150 : initialize the accumulator with a neutral value instead. */
5151 3435 : if (!operand_equal_p (initial_value, main_adjustment))
5152 : return false;
5153 3425 : initial_values[0] = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
5154 : }
/* Success: commit the results to REDUC_INFO. Its initial values are
   replaced wholesale by INITIAL_VALUES, which is empty when there is no
   main loop edge. */
5155 4015 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5156 4015 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
5157 4015 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
5158 4015 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
5159 4015 : return true;
5160 4559 : }
5161 :
5162 : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5163 : CODE, appending the generated stmts to SEQ. Returns a vector def of VECTYPE. */
/* The reduction proceeds by repeatedly splitting the current vector into
   two halves and combining them with CODE until the element count matches
   that of VECTYPE. If VEC_DEF already has that many elements, no
   combining statements are generated. */
5164 :
5165 : static tree
5166 4059 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5167 : gimple_seq *seq)
5168 : {
/* Boolean vectors are only accepted when their mode is an integer vector
   mode. */
5169 4059 : gcc_assert (!VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (vec_def))
5170 : || (GET_MODE_CLASS (TYPE_MODE (TREE_TYPE (vec_def)))
5171 : == MODE_VECTOR_INT));
/* Both element counts are taken as compile-time constants here. */
5172 4059 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5173 4059 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5174 4059 : tree stype = TREE_TYPE (vectype);
5175 4059 : tree new_temp = vec_def;
5176 8261 : while (nunits > nunits1)
5177 : {
/* VECTYPE1 is the half-width vector type for this step and BITSIZE its
   size in bits. */
5178 4202 : nunits /= 2;
5179 4202 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5180 4202 : stype, nunits);
5181 4202 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5182 :
5183 : /* The target has to make sure we support lowpart/highpart
5184 : extraction, either via direct vector extract or through
5185 : an integer mode punning. */
5186 4202 : tree dst1, dst2;
5187 4202 : gimple *epilog_stmt;
5188 4202 : if (convert_optab_handler (vec_extract_optab,
5189 4202 : TYPE_MODE (TREE_TYPE (new_temp)),
5190 4202 : TYPE_MODE (vectype1))
5191 : != CODE_FOR_nothing)
5192 : {
5193 : /* The target can extract sub-vectors directly: take the two
5194 : halves of NEW_TEMP with BIT_FIELD_REFs at bit offsets 0 and BITSIZE. */
5195 2590 : dst1 = make_ssa_name (vectype1);
5196 2590 : epilog_stmt
5197 5180 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5198 : build3 (BIT_FIELD_REF, vectype1,
5199 2590 : new_temp, TYPE_SIZE (vectype1),
5200 : bitsize_int (0)));
5201 2590 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5202 2590 : dst2 = make_ssa_name (vectype1);
5203 2590 : epilog_stmt
5204 2590 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5205 : build3 (BIT_FIELD_REF, vectype1,
5206 2590 : new_temp, TYPE_SIZE (vectype1),
5207 2590 : bitsize_int (bitsize)));
5208 2590 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5209 : }
5210 : else
5211 : {
5212 : /* Extract via punning to appropriately sized integer mode
5213 : vector. */
/* View NEW_TEMP as a vector of two unsigned BITSIZE-bit integers, pull
   out each integer with a BIT_FIELD_REF, and view each one back as
   VECTYPE1. The assert checks that the target can extract an element
   from that two-element integer vector. */
5214 1612 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5215 1612 : tree etype = build_vector_type (eltype, 2);
5216 3224 : gcc_assert (convert_optab_handler (vec_extract_optab,
5217 : TYPE_MODE (etype),
5218 : TYPE_MODE (eltype))
5219 : != CODE_FOR_nothing);
5220 1612 : tree tem = make_ssa_name (etype);
5221 1612 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5222 : build1 (VIEW_CONVERT_EXPR,
5223 : etype, new_temp));
5224 1612 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
/* From here on NEW_TEMP names the punned two-element integer vector. */
5225 1612 : new_temp = tem;
5226 1612 : tem = make_ssa_name (eltype);
5227 1612 : epilog_stmt
5228 3224 : = gimple_build_assign (tem, BIT_FIELD_REF,
5229 : build3 (BIT_FIELD_REF, eltype,
5230 1612 : new_temp, TYPE_SIZE (eltype),
5231 : bitsize_int (0)));
5232 1612 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5233 1612 : dst1 = make_ssa_name (vectype1);
5234 1612 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5235 : build1 (VIEW_CONVERT_EXPR,
5236 : vectype1, tem));
5237 1612 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5238 1612 : tem = make_ssa_name (eltype);
5239 1612 : epilog_stmt
5240 1612 : = gimple_build_assign (tem, BIT_FIELD_REF,
5241 : build3 (BIT_FIELD_REF, eltype,
5242 1612 : new_temp, TYPE_SIZE (eltype),
5243 1612 : bitsize_int (bitsize)));
5244 1612 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5245 1612 : dst2 = make_ssa_name (vectype1);
5246 1612 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5247 : build1 (VIEW_CONVERT_EXPR,
5248 : vectype1, tem));
5249 1612 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5250 : }
5251 :
/* Combine the two halves; the result is the input to the next step. */
5252 4202 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5253 : }
/* If the type of the result is not interchangeable with VECTYPE,
   view-convert it so that the returned def has type VECTYPE. */
5254 4059 : if (!useless_type_conversion_p (vectype, TREE_TYPE (new_temp)))
5255 : {
5256 66 : tree dst3 = make_ssa_name (vectype);
5257 66 : gimple *epilog_stmt = gimple_build_assign (dst3, VIEW_CONVERT_EXPR,
5258 : build1 (VIEW_CONVERT_EXPR,
5259 : vectype, new_temp));
5260 66 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5261 66 : new_temp = dst3;
5262 : }
5263 :
5264 4059 : return new_temp;
5265 : }
5266 :
5267 : /* Function vect_create_epilog_for_reduction
5268 :
5269 : Create code at the loop-epilog to finalize the result of a reduction
5270 : computation.
5271 :
5272 : STMT_INFO is the scalar reduction stmt that is being vectorized.
5273 : SLP_NODE is an SLP node containing a group of reduction statements. The
5274 : first one in this group is STMT_INFO.
5275 : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5276 : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5277 : (counting from 0)
5278 : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5279 : exit this edge is always the main loop exit.
5280 :
5281 : This function:
5282 : 1. Completes the reduction def-use cycles.
5283 : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5284 : by calling the function specified by REDUC_FN if available, or by
5285 : other means (whole-vector shifts or a scalar loop).
5286 : The function also creates a new phi node at the loop exit to preserve
5287 : loop-closed form, as illustrated below.
5288 :
5289 : The flow at the entry to this function:
5290 :
5291 : loop:
5292 : vec_def = phi <vec_init, null> # REDUCTION_PHI
5293 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5294 : s_loop = scalar_stmt # (scalar) STMT_INFO
5295 : loop_exit:
5296 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5297 : use <s_out0>
5298 : use <s_out0>
5299 :
5300 : The above is transformed by this function into:
5301 :
5302 : loop:
5303 : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5304 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5305 : s_loop = scalar_stmt # (scalar) STMT_INFO
5306 : loop_exit:
5307 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5308 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5309 : v_out2 = reduce <v_out1>
5310 : s_out3 = extract_field <v_out2, 0>
5311 : s_out4 = adjust_result <s_out3>
5312 : use <s_out4>
5313 : use <s_out4>
5314 : */
5315 :
5316 : static void
5317 21951 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5318 : stmt_vec_info stmt_info,
5319 : slp_tree slp_node,
5320 : slp_instance slp_node_instance,
5321 : edge loop_exit)
5322 : {
5323 21951 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
5324 21951 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5325 21951 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
5326 21951 : tree vectype;
5327 21951 : machine_mode mode;
5328 21951 : basic_block exit_bb;
5329 21951 : gimple *new_phi = NULL, *phi = NULL;
5330 21951 : gimple_stmt_iterator exit_gsi;
5331 21951 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5332 21951 : gimple *epilog_stmt = NULL;
5333 21951 : gimple *exit_phi;
5334 21951 : tree def;
5335 21951 : tree orig_name, scalar_result;
5336 21951 : imm_use_iterator imm_iter;
5337 21951 : use_operand_p use_p;
5338 21951 : gimple *use_stmt;
5339 21951 : auto_vec<tree> reduc_inputs;
5340 21951 : int j, i;
5341 21951 : vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
5342 21951 : unsigned int k;
5343 : /* SLP reduction without reduction chain, e.g.,
5344 : # a1 = phi <a2, a0>
5345 : # b1 = phi <b2, b0>
5346 : a2 = operation (a1)
5347 : b2 = operation (b1) */
5348 21951 : const bool slp_reduc = !reduc_info->is_reduc_chain;
5349 21951 : tree induction_index = NULL_TREE;
5350 :
5351 21951 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5352 :
5353 21951 : bool double_reduc = false;
5354 21951 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5355 21951 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5356 : {
5357 0 : double_reduc = true;
5358 0 : gcc_assert (slp_reduc);
5359 : }
5360 :
5361 21951 : vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
5362 21951 : gcc_assert (vectype);
5363 21951 : mode = TYPE_MODE (vectype);
5364 :
5365 21951 : tree induc_val = NULL_TREE;
5366 21951 : tree adjustment_def = NULL;
5367 : /* Optimize: for induction condition reduction, if we can't use zero
5368 : for induc_val, use initial_def. */
5369 21951 : if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5370 62 : induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
5371 21889 : else if (double_reduc)
5372 : ;
5373 : else
5374 21889 : adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
5375 :
5376 21951 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5377 21951 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5378 21951 : if (slp_reduc)
5379 : /* All statements produce live-out values. */
5380 43500 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5381 :
5382 21951 : unsigned vec_num
5383 21951 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5384 :
5385 : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5386 : which is updated with the current index of the loop for every match of
5387 : the original loop's cond_expr (VEC_STMT). This results in a vector
5388 : containing the last time the condition passed for that vector lane.
5389 : The first match will be a 1 to allow 0 to be used for non-matching
5390 : indexes. If there are no matches at all then the vector will be all
5391 : zeroes.
5392 :
5393 : PR92772: This algorithm is broken for architectures that support
5394 : masked vectors, but do not provide fold_extract_last. */
5395 21951 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
5396 : {
5397 67 : gcc_assert (!double_reduc);
5398 67 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5399 67 : slp_tree cond_node = slp_node_instance->root;
5400 143 : while (cond_node != slp_node_instance->reduc_phis)
5401 : {
5402 76 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5403 76 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5404 : {
5405 76 : gimple *vec_stmt
5406 76 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5407 76 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5408 76 : ccompares.safe_push
5409 76 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5410 76 : SLP_TREE_REDUC_IDX (cond_node) == 2));
5411 : }
5412 76 : int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
5413 76 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5414 : }
5415 67 : gcc_assert (ccompares.length () != 0);
5416 :
5417 67 : tree indx_before_incr, indx_after_incr;
5418 67 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5419 67 : int scalar_precision
5420 67 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5421 67 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5422 67 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
5423 67 : (TYPE_MODE (vectype), cr_index_scalar_type,
5424 : TYPE_VECTOR_SUBPARTS (vectype));
5425 :
5426 : /* First we create a simple vector induction variable which starts
5427 : with the values {1,2,3,...} (SERIES_VECT) and increments by the
5428 : vector size (STEP). */
5429 :
5430 : /* Create a {1,2,3,...} vector. */
5431 67 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5432 :
5433 : /* Create a vector of the step value. */
5434 67 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5435 67 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5436 :
5437 : /* Create an induction variable. */
5438 67 : gimple_stmt_iterator incr_gsi;
5439 67 : bool insert_after;
5440 67 : vect_iv_increment_position (LOOP_VINFO_MAIN_EXIT (loop_vinfo),
5441 : &incr_gsi, &insert_after);
5442 67 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5443 : insert_after, &indx_before_incr, &indx_after_incr);
5444 :
5445 : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5446 : filled with zeros (VEC_ZERO). */
5447 :
5448 : /* Create a vector of 0s. */
5449 67 : tree zero = build_zero_cst (cr_index_scalar_type);
5450 67 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5451 :
5452 : /* Create a vector phi node. */
5453 67 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5454 67 : new_phi = create_phi_node (new_phi_tree, loop->header);
5455 67 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5456 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
5457 :
5458 : /* Now take the condition from the loops original cond_exprs
5459 : and produce a new cond_exprs (INDEX_COND_EXPR) which for
5460 : every match uses values from the induction variable
5461 : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5462 : (NEW_PHI_TREE).
5463 : Finally, we update the phi (NEW_PHI_TREE) to take the value of
5464 : the new cond_expr (INDEX_COND_EXPR). */
5465 67 : gimple_seq stmts = NULL;
5466 210 : for (int i = ccompares.length () - 1; i != -1; --i)
5467 : {
5468 76 : tree ccompare = ccompares[i].first;
5469 76 : if (ccompares[i].second)
5470 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5471 : cr_index_vector_type,
5472 : ccompare,
5473 : indx_before_incr, new_phi_tree);
5474 : else
5475 7 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5476 : cr_index_vector_type,
5477 : ccompare,
5478 : new_phi_tree, indx_before_incr);
5479 : }
5480 67 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5481 :
5482 : /* Update the phi with the vec cond. */
5483 67 : induction_index = new_phi_tree;
5484 67 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5485 : loop_latch_edge (loop), UNKNOWN_LOCATION);
5486 67 : }
5487 :
5488 : /* 2. Create epilog code.
5489 : The reduction epilog code operates across the elements of the vector
5490 : of partial results computed by the vectorized loop.
5491 : The reduction epilog code consists of:
5492 :
5493 : step 1: compute the scalar result in a vector (v_out2)
5494 : step 2: extract the scalar result (s_out3) from the vector (v_out2)
5495 : step 3: adjust the scalar result (s_out3) if needed.
5496 :
5497 : Step 1 can be accomplished using one of the following three schemes:
5498 : (scheme 1) using reduc_fn, if available.
5499 : (scheme 2) using whole-vector shifts, if available.
5500 : (scheme 3) using a scalar loop. In this case steps 1+2 above are
5501 : combined.
5502 :
5503 : The overall epilog code looks like this:
5504 :
5505 : s_out0 = phi <s_loop> # original EXIT_PHI
5506 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5507 : v_out2 = reduce <v_out1> # step 1
5508 : s_out3 = extract_field <v_out2, 0> # step 2
5509 : s_out4 = adjust_result <s_out3> # step 3
5510 :
5511 : (step 3 is optional, and steps 1 and 2 may be combined).
5512 : Lastly, the uses of s_out0 are replaced by s_out4. */
5513 :
5514 :
5515 : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5516 : v_out1 = phi <VECT_DEF>
5517 : Store them in NEW_PHIS. */
5518 : /* We need to reduce values in all exits. */
5519 21951 : exit_bb = loop_exit->dest;
5520 21951 : exit_gsi = gsi_after_labels (exit_bb);
5521 21951 : reduc_inputs.create (vec_num);
5522 45376 : for (unsigned i = 0; i < vec_num; i++)
5523 : {
5524 23425 : gimple_seq stmts = NULL;
5525 23425 : def = vect_get_slp_vect_def (slp_node, i);
5526 23425 : tree new_def = copy_ssa_name (def);
5527 23425 : phi = create_phi_node (new_def, exit_bb);
5528 23425 : if (LOOP_VINFO_MAIN_EXIT (loop_vinfo) == loop_exit)
5529 23398 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
5530 : else
5531 : {
5532 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
5533 30 : SET_PHI_ARG_DEF (phi, k, def);
5534 : }
5535 23425 : new_def = gimple_convert (&stmts, vectype, new_def);
5536 23425 : reduc_inputs.quick_push (new_def);
5537 23425 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5538 : }
5539 :
5540 : /* 2.2 Get the original scalar reduction variable as defined in the loop.
5541 : In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
5542 : pattern), the scalar-def is taken from the original stmt that the
5543 : pattern-stmt (STMT) replaces. */
5544 :
5545 22776 : tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5546 21951 : tree scalar_type = TREE_TYPE (scalar_dest);
5547 21951 : scalar_results.truncate (0);
5548 21951 : scalar_results.reserve_exact (group_size);
5549 21951 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5550 :
5551 : /* True if we should implement SLP_REDUC using native reduction operations
5552 : instead of scalar operations. */
5553 21951 : const bool direct_slp_reduc
5554 21951 : = (reduc_fn != IFN_LAST
5555 21951 : && slp_reduc
5556 21951 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5557 :
5558 : /* If signed overflow is undefined we might need to perform reduction
5559 : computations in an unsigned type. */
5560 21951 : tree compute_vectype = vectype;
5561 21951 : if (ANY_INTEGRAL_TYPE_P (vectype)
5562 15012 : && TYPE_OVERFLOW_UNDEFINED (vectype)
5563 5571 : && code.is_tree_code ()
5564 27522 : && arith_code_with_undefined_signed_overflow ((tree_code) code))
5565 4105 : compute_vectype = unsigned_type_for (vectype);
5566 :
5567 : /* In case of reduction chain, e.g.,
5568 : # a1 = phi <a3, a0>
5569 : a2 = operation (a1)
5570 : a3 = operation (a2),
5571 :
5572 : we may end up with more than one vector result. Here we reduce them
5573 : to one vector.
5574 :
5575 : The same is true for a SLP reduction, e.g.,
5576 : # a1 = phi <a2, a0>
5577 : # b1 = phi <b2, b0>
5578 : a2 = operation (a1)
5579 : b2 = operation (a2),
5580 :
5581 : where we can end up with more than one vector as well. We can
5582 : easily accumulate vectors when the number of vector elements is
5583 : a multiple of the SLP group size.
5584 :
5585 : The same is true if we couldn't use a single defuse cycle. */
5586 21951 : if ((!slp_reduc
5587 : || direct_slp_reduc
5588 : || (slp_reduc
5589 21951 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
5590 43902 : && reduc_inputs.length () > 1)
5591 : {
5592 544 : gimple_seq stmts = NULL;
5593 544 : tree single_input = reduc_inputs[0];
5594 544 : if (compute_vectype != vectype)
5595 159 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5596 : compute_vectype, single_input);
5597 1865 : for (k = 1; k < reduc_inputs.length (); k++)
5598 : {
5599 1321 : tree input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5600 1321 : compute_vectype, reduc_inputs[k]);
5601 1321 : single_input = gimple_build (&stmts, code, compute_vectype,
5602 : single_input, input);
5603 : }
5604 544 : if (compute_vectype != vectype)
5605 159 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5606 : vectype, single_input);
5607 544 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5608 :
5609 544 : reduc_inputs.truncate (0);
5610 544 : reduc_inputs.safe_push (single_input);
5611 : }
5612 :
5613 21951 : tree orig_reduc_input = reduc_inputs[0];
5614 :
5615 : /* If this loop is an epilogue loop that can be skipped after the
5616 : main loop, we can only share a reduction operation between the
5617 : main loop and the epilogue if we put it at the target of the
5618 : skip edge.
5619 :
5620 : We can still reuse accumulators if this check fails. Doing so has
5621 : the minor(?) benefit of making the epilogue loop's scalar result
5622 : independent of the main loop's scalar result. */
5623 21951 : bool unify_with_main_loop_p = false;
5624 21951 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
5625 4015 : && loop_vinfo->skip_this_loop_edge
5626 3775 : && single_succ_p (exit_bb)
5627 21972 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5628 : {
5629 21 : unify_with_main_loop_p = true;
5630 :
5631 21 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5632 21 : reduc_inputs[0] = make_ssa_name (vectype);
5633 21 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5634 21 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5635 : UNKNOWN_LOCATION);
5636 21 : add_phi_arg (new_phi,
5637 21 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
5638 : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5639 21 : exit_gsi = gsi_after_labels (reduc_block);
5640 : }
5641 :
5642 : /* Shouldn't be used beyond this point. */
5643 21951 : exit_bb = nullptr;
5644 :
5645 : /* If we are operating on a mask vector and do not support direct mask
5646 : reduction, work on a bool data vector instead of a mask vector. */
5647 21951 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5648 229 : && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
5649 22145 : && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
5650 : {
5651 194 : compute_vectype = vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
5652 194 : gimple_seq stmts = NULL;
5653 396 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5654 404 : reduc_inputs[i] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
5655 202 : reduc_inputs[i],
5656 : build_one_cst (vectype),
5657 : build_zero_cst (vectype));
5658 194 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5659 : }
5660 :
5661 21951 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5662 67 : && reduc_fn != IFN_LAST)
5663 : {
5664 : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5665 : various data values where the condition matched and another vector
5666 : (INDUCTION_INDEX) containing all the indexes of those matches. We
5667 : need to extract the last matching index (which will be the index with
5668 : highest value) and use this to index into the data vector.
5669 : For the case where there were no matches, the data vector will contain
5670 : all default values and the index vector will be all zeros. */
5671 :
5672 : /* Get various versions of the type of the vector of indexes. */
5673 4 : tree index_vec_type = TREE_TYPE (induction_index);
5674 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5675 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
5676 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
5677 :
5678 : /* Get an unsigned integer version of the type of the data vector. */
5679 4 : int scalar_precision
5680 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5681 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5682 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5683 : vectype);
5684 :
5685 : /* First we need to create a vector (ZERO_VEC) of zeros and another
5686 : vector (MAX_INDEX_VEC) filled with the last matching index, which we
5687 : can create using a MAX reduction and then expanding.
5688 : In the case where the loop never made any matches, the max index will
5689 : be zero. */
5690 :
5691 : /* Vector of {0, 0, 0,...}. */
5692 4 : tree zero_vec = build_zero_cst (vectype);
5693 :
5694 : /* Find maximum value from the vector of found indexes. */
5695 4 : tree max_index = make_ssa_name (index_scalar_type);
5696 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5697 : 1, induction_index);
5698 4 : gimple_call_set_lhs (max_index_stmt, max_index);
5699 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5700 :
5701 : /* Vector of {max_index, max_index, max_index,...}. */
5702 4 : tree max_index_vec = make_ssa_name (index_vec_type);
5703 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5704 : max_index);
5705 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5706 : max_index_vec_rhs);
5707 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5708 :
5709 : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5710 : with the vector (INDUCTION_INDEX) of found indexes, choosing values
5711 : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5712 : otherwise. Only one value should match, resulting in a vector
5713 : (VEC_COND) with one data value and the rest zeros.
5714 : In the case where the loop never made any matches, every index will
5715 : match, resulting in a vector with all data values (which will all be
5716 : the default value). */
5717 :
5718 : /* Compare the max index vector to the vector of found indexes to find
5719 : the position of the max value. */
5720 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
5721 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5722 : induction_index,
5723 : max_index_vec);
5724 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5725 :
5726 : /* Use the compare to choose either values from the data vector or
5727 : zero. */
5728 4 : tree vec_cond = make_ssa_name (vectype);
5729 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5730 : vec_compare,
5731 4 : reduc_inputs[0],
5732 : zero_vec);
5733 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5734 :
5735 : /* Finally we need to extract the data value from the vector (VEC_COND)
5736 : into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5737 : reduction, but because this doesn't exist, we can use a MAX reduction
5738 : instead. The data value might be signed or a float so we need to cast
5739 : it first.
5740 : In the case where the loop never made any matches, the data values are
5741 : all identical, and so will reduce down correctly. */
5742 :
5743 : /* Make the matched data values unsigned. */
5744 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5745 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5746 : vec_cond);
5747 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5748 : VIEW_CONVERT_EXPR,
5749 : vec_cond_cast_rhs);
5750 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5751 :
5752 : /* Reduce down to a scalar value. */
5753 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
5754 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5755 : 1, vec_cond_cast);
5756 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5757 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5758 :
5759 : /* Convert the reduced value back to the result type and set as the
5760 : result. */
5761 4 : gimple_seq stmts = NULL;
5762 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5763 : data_reduc);
5764 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5765 4 : scalar_results.safe_push (new_temp);
5766 4 : }
5767 21947 : else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5768 63 : && reduc_fn == IFN_LAST)
5769 : {
5770 : /* Condition reduction without supported IFN_REDUC_MAX. Generate
5771 : idx = 0;
5772 : idx_val = induction_index[0];
5773 : val = data_reduc[0];
5774 : for (idx = 0, val = init, i = 0; i < nelts; ++i)
5775 : if (induction_index[i] > idx_val)
5776 : val = data_reduc[i], idx_val = induction_index[i];
5777 : return val; */
5778 :
5779 63 : tree data_eltype = TREE_TYPE (vectype);
5780 63 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5781 63 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5782 63 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5783 : /* Enforced by vectorizable_reduction, which ensures we have target
5784 : support before allowing a conditional reduction on variable-length
5785 : vectors. */
5786 63 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5787 63 : tree idx_val = NULL_TREE, val = NULL_TREE;
5788 419 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5789 : {
5790 356 : tree old_idx_val = idx_val;
5791 356 : tree old_val = val;
5792 356 : idx_val = make_ssa_name (idx_eltype);
5793 356 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5794 : build3 (BIT_FIELD_REF, idx_eltype,
5795 : induction_index,
5796 356 : bitsize_int (el_size),
5797 356 : bitsize_int (off)));
5798 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5799 356 : val = make_ssa_name (data_eltype);
5800 712 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5801 : build3 (BIT_FIELD_REF,
5802 : data_eltype,
5803 356 : reduc_inputs[0],
5804 356 : bitsize_int (el_size),
5805 356 : bitsize_int (off)));
5806 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5807 356 : if (off != 0)
5808 : {
5809 293 : tree new_idx_val = idx_val;
5810 293 : if (off != v_size - el_size)
5811 : {
5812 230 : new_idx_val = make_ssa_name (idx_eltype);
5813 230 : epilog_stmt = gimple_build_assign (new_idx_val,
5814 : MAX_EXPR, idx_val,
5815 : old_idx_val);
5816 230 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5817 : }
5818 293 : tree cond = make_ssa_name (boolean_type_node);
5819 293 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5820 : idx_val, old_idx_val);
5821 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5822 293 : tree new_val = make_ssa_name (data_eltype);
5823 293 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5824 : cond, val, old_val);
5825 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5826 293 : idx_val = new_idx_val;
5827 293 : val = new_val;
5828 : }
5829 : }
5830 : /* Convert the reduced value back to the result type and set as the
5831 : result. */
5832 63 : gimple_seq stmts = NULL;
5833 63 : val = gimple_convert (&stmts, scalar_type, val);
5834 63 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5835 63 : scalar_results.safe_push (val);
5836 63 : }
5837 :
5838 : /* 2.3 Create the reduction code, using one of the three schemes described
5839 : above. In SLP we simply need to extract all the elements from the
5840 : vector (without reducing them), so we use scalar shifts. */
5841 21884 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
5842 : {
5843 19914 : tree tmp;
5844 19914 : tree vec_elem_type;
5845 :
5846 : /* Case 1: Create:
5847 : v_out2 = reduc_expr <v_out1> */
5848 :
5849 19914 : if (dump_enabled_p ())
5850 1512 : dump_printf_loc (MSG_NOTE, vect_location,
5851 : "Reduce using direct vector reduction.\n");
5852 :
5853 19914 : gimple_seq stmts = NULL;
5854 19914 : vec_elem_type = TREE_TYPE (vectype);
5855 19914 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5856 19914 : vec_elem_type, reduc_inputs[0]);
5857 19914 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5858 19914 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5859 :
5860 19914 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5861 62 : && induc_val)
5862 : {
5863 : /* Earlier we set the initial value to be a vector of induc_val
5864 : values. Check the result and if it is induc_val then replace
5865 : with the original initial value, unless induc_val is
5866 : the same as initial_def already. */
5867 60 : tree zcompare = make_ssa_name (boolean_type_node);
5868 60 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5869 : new_temp, induc_val);
5870 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5871 60 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
5872 60 : tmp = make_ssa_name (new_scalar_dest);
5873 60 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5874 : initial_def, new_temp);
5875 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5876 60 : new_temp = tmp;
5877 : }
5878 :
5879 19914 : scalar_results.safe_push (new_temp);
5880 19914 : }
5881 1783 : else if (direct_slp_reduc)
5882 : {
5883 : /* Here we create one vector for each of the GROUP_SIZE results,
5884 : with the elements for other SLP statements replaced with the
5885 : neutral value. We can then do a normal reduction on each vector. */
5886 :
5887 : /* Enforced by vectorizable_reduction. */
5888 : gcc_assert (reduc_inputs.length () == 1);
5889 : gcc_assert (pow2p_hwi (group_size));
5890 :
5891 : gimple_seq seq = NULL;
5892 :
5893 : /* Build a vector {0, 1, 2, ...}, with the same number of elements
5894 : and the same element size as VECTYPE. */
5895 : tree index = build_index_vector (vectype, 0, 1);
5896 : tree index_type = TREE_TYPE (index);
5897 : tree index_elt_type = TREE_TYPE (index_type);
5898 : tree mask_type = truth_type_for (index_type);
5899 :
5900 : /* Create a vector that, for each element, identifies which of
5901 : the results should use it. */
5902 : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5903 : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5904 : build_vector_from_val (index_type, index_mask));
5905 :
5906 : /* Get a neutral vector value. This is simply a splat of the neutral
5907 : scalar value if we have one, otherwise the initial scalar value
5908 : is itself a neutral value. */
5909 : tree vector_identity = NULL_TREE;
5910 : tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5911 : NULL_TREE, false);
5912 : if (neutral_op)
5913 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5914 : neutral_op);
5915 : for (unsigned int i = 0; i < group_size; ++i)
5916 : {
5917 : /* If there's no universal neutral value, we can use the
5918 : initial scalar value from the original PHI. This is used
5919 : for MIN and MAX reduction, for example. */
5920 : if (!neutral_op)
5921 : {
5922 : tree scalar_value
5923 : = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
5924 : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5925 : scalar_value);
5926 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5927 : scalar_value);
5928 : }
5929 :
5930 : /* Calculate the equivalent of:
5931 :
5932 : sel[j] = (index[j] == i);
5933 :
5934 : which selects the elements of REDUC_INPUTS[0] that should
5935 : be included in the result. */
5936 : tree compare_val = build_int_cst (index_elt_type, i);
5937 : compare_val = build_vector_from_val (index_type, compare_val);
5938 : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5939 : index, compare_val);
5940 :
5941 : /* Calculate the equivalent of:
5942 :
5943 : vec = sel ? reduc_inputs[0] : vector_identity;
5944 :
5945 : VEC is now suitable for a full vector reduction. */
5946 : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5947 : sel, reduc_inputs[0], vector_identity);
5948 :
5949 : /* Do the reduction and convert it to the appropriate type. */
5950 : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5951 : TREE_TYPE (vectype), vec);
5952 : scalar = gimple_convert (&seq, scalar_type, scalar);
5953 : scalar_results.safe_push (scalar);
5954 : }
5955 : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5956 : }
5957 : else
5958 : {
5959 1783 : bool reduce_with_shift;
5960 1783 : tree vec_temp;
5961 :
5962 1783 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5963 :
5964 : /* See if the target wants to do the final (shift) reduction
5965 : in a vector mode of smaller size and first reduce upper/lower
5966 : halves against each other. */
5967 1970 : enum machine_mode mode1 = mode;
5968 1970 : tree stype = TREE_TYPE (vectype);
5969 1970 : if (compute_vectype != vectype)
5970 : {
5971 544 : stype = unsigned_type_for (stype);
5972 544 : gimple_seq stmts = NULL;
5973 1146 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5974 : {
5975 602 : tree new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5976 602 : compute_vectype, reduc_inputs[i]);
5977 602 : reduc_inputs[i] = new_temp;
5978 : }
5979 544 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5980 : }
5981 1970 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5982 1970 : unsigned nunits1 = nunits;
5983 1970 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5984 1970 : && reduc_inputs.length () == 1)
5985 : {
5986 41 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5987 : /* For SLP reductions we have to make sure lanes match up, but
5988 : since we're doing individual element final reduction reducing
5989 : vector width here is even more important.
5990 : ??? We can also separate lanes with permutes, for the common
5991 : case of power-of-two group-size odd/even extracts would work. */
5992 41 : if (slp_reduc && nunits != nunits1)
5993 : {
5994 41 : nunits1 = least_common_multiple (nunits1, group_size);
5995 82 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5996 : }
5997 : }
5998 1929 : else if (!slp_reduc
5999 1929 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6000 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6001 :
6002 1970 : tree vectype1 = compute_vectype;
6003 1970 : if (mode1 != mode)
6004 : {
6005 47 : vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6006 47 : stype, nunits1);
6007 : /* First reduce the vector to the desired vector size we should
6008 : do shift reduction on by combining upper and lower halves. */
6009 47 : gimple_seq stmts = NULL;
6010 47 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6011 : code, &stmts);
6012 47 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6013 47 : reduc_inputs[0] = new_temp;
6014 : }
6015 :
6016 1970 : reduce_with_shift = have_whole_vector_shift (mode1);
6017 732 : if (!VECTOR_MODE_P (mode1)
6018 2700 : || !directly_supported_p (code, vectype1))
6019 : reduce_with_shift = false;
6020 :
6021 1953 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
6022 : {
6023 1730 : int element_bitsize = vector_element_bits (vectype1);
6024 : /* Enforced by vectorizable_reduction, which disallows SLP reductions
6025 : for variable-length vectors and also requires direct target support
6026 : for loop reductions. */
6027 1730 : int nelements = TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6028 1730 : vec_perm_builder sel;
6029 1730 : vec_perm_indices indices;
6030 :
6031 1730 : int elt_offset;
6032 :
6033 1730 : tree zero_vec = build_zero_cst (vectype1);
6034 : /* Case 2: Create:
6035 : for (offset = nelements/2; offset >= 1; offset/=2)
6036 : {
6037 : Create: va' = vec_shift <va, offset>
6038 : Create: va = vop <va, va'>
6039 : } */
6040 :
6041 1730 : if (dump_enabled_p ())
6042 366 : dump_printf_loc (MSG_NOTE, vect_location,
6043 : "Reduce using vector shifts\n");
6044 :
6045 1730 : gimple_seq stmts = NULL;
6046 1730 : new_temp = gimple_convert (&stmts, vectype1, reduc_inputs[0]);
6047 1730 : for (elt_offset = nelements / 2;
6048 3770 : elt_offset >= 1;
6049 2040 : elt_offset /= 2)
6050 : {
6051 2040 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6052 2040 : indices.new_vector (sel, 2, nelements);
6053 2040 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6054 2040 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6055 : new_temp, zero_vec, mask);
6056 2040 : new_temp = gimple_build (&stmts, code,
6057 : vectype1, new_name, new_temp);
6058 : }
6059 :
6060 : /* 2.4 Extract the final scalar result. Create:
6061 : s_out3 = extract_field <v_out2, bitpos> */
6062 :
6063 1730 : if (dump_enabled_p ())
6064 366 : dump_printf_loc (MSG_NOTE, vect_location,
6065 : "extract scalar result\n");
6066 :
6067 1730 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
6068 1730 : new_temp, bitsize_int (element_bitsize),
6069 1730 : bitsize_zero_node);
6070 1730 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6071 1730 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6072 1730 : scalar_results.safe_push (new_temp);
6073 1730 : }
6074 : else
6075 : {
6076 : /* Case 3: Create:
6077 : s = extract_field <v_out2, 0>
6078 : for (offset = element_size;
6079 : offset < vector_size;
6080 : offset += element_size;)
6081 : {
6082 : Create: s' = extract_field <v_out2, offset>
6083 : Create: s = op <s, s'> // For non SLP cases
6084 : } */
6085 :
6086 240 : if (dump_enabled_p ())
6087 151 : dump_printf_loc (MSG_NOTE, vect_location,
6088 : "Reduce using scalar code.\n");
6089 :
6090 240 : tree compute_type = TREE_TYPE (vectype1);
6091 240 : unsigned element_bitsize = vector_element_bits (vectype1);
6092 240 : unsigned vec_size_in_bits = element_bitsize
6093 240 : * TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6094 240 : tree bitsize = bitsize_int (element_bitsize);
6095 240 : gimple_seq stmts = NULL;
6096 633 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6097 : {
6098 393 : unsigned bit_offset;
6099 786 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6100 393 : vec_temp, bitsize, bitsize_zero_node);
6101 :
6102 : /* In SLP we don't need to apply reduction operation, so we just
6103 : collect s' values in SCALAR_RESULTS. */
6104 393 : if (slp_reduc)
6105 383 : scalar_results.safe_push (new_temp);
6106 :
6107 955 : for (bit_offset = element_bitsize;
6108 1348 : bit_offset < vec_size_in_bits;
6109 955 : bit_offset += element_bitsize)
6110 : {
6111 955 : tree bitpos = bitsize_int (bit_offset);
6112 955 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6113 : compute_type, vec_temp,
6114 : bitsize, bitpos);
6115 955 : if (slp_reduc)
6116 : {
6117 : /* In SLP we don't need to apply reduction operation, so
6118 : we just collect s' values in SCALAR_RESULTS. */
6119 945 : new_temp = new_name;
6120 945 : scalar_results.safe_push (new_name);
6121 : }
6122 : else
6123 10 : new_temp = gimple_build (&stmts, code, compute_type,
6124 : new_name, new_temp);
6125 : }
6126 : }
6127 :
6128 : /* The only case where we need to reduce scalar results in a SLP
6129 : reduction, is unrolling. If the size of SCALAR_RESULTS is
6130 : greater than GROUP_SIZE, we reduce them combining elements modulo
6131 : GROUP_SIZE. */
6132 240 : if (slp_reduc)
6133 : {
6134 230 : tree res, first_res, new_res;
6135 :
6136 : /* Reduce multiple scalar results in case of SLP unrolling. */
6137 878 : for (j = group_size; scalar_results.iterate (j, &res);
6138 : j++)
6139 : {
6140 648 : first_res = scalar_results[j % group_size];
6141 648 : new_res = gimple_build (&stmts, code, compute_type,
6142 : first_res, res);
6143 648 : scalar_results[j % group_size] = new_res;
6144 : }
6145 230 : scalar_results.truncate (group_size);
6146 1140 : for (k = 0; k < group_size; k++)
6147 1360 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6148 680 : scalar_results[k]);
6149 : }
6150 : else
6151 : {
6152 : /* Reduction chain - we have one scalar to keep in
6153 : SCALAR_RESULTS. */
6154 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6155 10 : scalar_results.safe_push (new_temp);
6156 : }
6157 :
6158 240 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6159 : }
6160 :
6161 1970 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6162 0 : && induc_val)
6163 : {
6164 : /* Earlier we set the initial value to be a vector of induc_val
6165 : values. Check the result and if it is induc_val then replace
6166 : with the original initial value, unless induc_val is
6167 : the same as initial_def already. */
6168 0 : tree zcompare = make_ssa_name (boolean_type_node);
6169 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6170 0 : scalar_results[0], induc_val);
6171 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6172 0 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
6173 0 : tree tmp = make_ssa_name (new_scalar_dest);
6174 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6175 0 : initial_def, scalar_results[0]);
6176 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6177 0 : scalar_results[0] = tmp;
6178 : }
6179 : }
6180 :
6181 : /* 2.5 Adjust the final result by the initial value of the reduction
6182 : variable. (When such adjustment is not needed, then
6183 : 'adjustment_def' is zero). For example, if code is PLUS we create:
6184 : new_temp = loop_exit_def + adjustment_def */
6185 :
6186 21951 : if (adjustment_def)
6187 : {
6188 15735 : gcc_assert (!slp_reduc || group_size == 1);
6189 15735 : gimple_seq stmts = NULL;
6190 15735 : if (double_reduc)
6191 : {
6192 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6193 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6194 0 : new_temp = gimple_build (&stmts, code, vectype,
6195 0 : reduc_inputs[0], adjustment_def);
6196 : }
6197 : else
6198 : {
6199 15735 : new_temp = scalar_results[0];
6200 15735 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6201 15735 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6202 : adjustment_def);
6203 15735 : new_temp = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6204 : new_temp);
6205 15735 : new_temp = gimple_build (&stmts, code, TREE_TYPE (compute_vectype),
6206 : new_temp, adjustment_def);
6207 15735 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6208 : }
6209 :
6210 15735 : epilog_stmt = gimple_seq_last_stmt (stmts);
6211 15735 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6212 15735 : scalar_results[0] = new_temp;
6213 : }
6214 :
6215 : /* Record this operation if it could be reused by the epilogue loop. */
6216 21951 : if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
6217 21951 : && reduc_inputs.length () == 1)
6218 21767 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6219 : { orig_reduc_input, reduc_info });
6220 :
6221 : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6222 : phis with new adjusted scalar results, i.e., replace use <s_out0>
6223 : with use <s_out4>.
6224 :
6225 : Transform:
6226 : loop_exit:
6227 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6228 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6229 : v_out2 = reduce <v_out1>
6230 : s_out3 = extract_field <v_out2, 0>
6231 : s_out4 = adjust_result <s_out3>
6232 : use <s_out0>
6233 : use <s_out0>
6234 :
6235 : into:
6236 :
6237 : loop_exit:
6238 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6239 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6240 : v_out2 = reduce <v_out1>
6241 : s_out3 = extract_field <v_out2, 0>
6242 : s_out4 = adjust_result <s_out3>
6243 : use <s_out4>
6244 : use <s_out4> */
6245 :
6246 43902 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6247 21951 : auto_vec<gimple *> phis;
6248 44352 : for (k = 0; k < live_out_stmts.size (); k++)
6249 : {
6250 22401 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6251 22401 : tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6252 :
6253 : /* Find the loop-closed-use at the loop exit of the original scalar
6254 : result. (The reduction result is expected to have two immediate uses,
6255 : one at the latch block, and one at the loop exit). Note with
6256 : early break we can have two exit blocks, so pick the correct PHI. */
6257 113541 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6258 68739 : if (!is_gimple_debug (USE_STMT (use_p))
6259 68739 : && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6260 : {
6261 22396 : gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
6262 22396 : if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6263 22388 : phis.safe_push (USE_STMT (use_p));
6264 22401 : }
6265 :
6266 44789 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6267 : {
6268 : /* Replace the uses: */
6269 22388 : orig_name = PHI_RESULT (exit_phi);
6270 :
6271 : /* Look for a single use at the target of the skip edge. */
6272 22388 : if (unify_with_main_loop_p)
6273 : {
6274 38 : use_operand_p use_p;
6275 38 : gimple *user;
6276 38 : if (!single_imm_use (orig_name, &use_p, &user))
6277 0 : gcc_unreachable ();
6278 38 : orig_name = gimple_get_lhs (user);
6279 : }
6280 :
6281 22388 : scalar_result = scalar_results[k];
6282 82987 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6283 : {
6284 38211 : gphi *use_phi = dyn_cast <gphi *> (use_stmt);
6285 114677 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6286 : {
6287 38233 : if (use_phi
6288 38233 : && (phi_arg_edge_from_use (use_p)->flags & EDGE_ABNORMAL))
6289 : {
6290 0 : gcc_assert (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (orig_name));
6291 0 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (scalar_result) = 1;
6292 : }
6293 38233 : SET_USE (use_p, scalar_result);
6294 : }
6295 38211 : update_stmt (use_stmt);
6296 22388 : }
6297 : }
6298 :
6299 22401 : phis.truncate (0);
6300 : }
6301 21951 : }
6302 :
6303 : /* Return a vector of type VECTYPE that is equal to the vector select
6304 : operation "MASK ? VEC : IDENTITY". Insert the select statement
6305 : before GSI.
     :
     : GSI is the statement iterator handed to gsi_insert_before, together
     : with GSI_SAME_STMT, to place the new statement.
     : MASK, VEC and IDENTITY become, in that order, the three operands of
     : a single VEC_COND_EXPR assignment.
     : The left-hand side of that assignment is a new SSA name of type
     : VECTYPE obtained from make_temp_ssa_name with the name "cond"; that
     : SSA name is the value returned. Exactly one statement is built and
     : inserted per call. */
6306 :
6307 : static tree
6308 9 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6309 : tree vec, tree identity)
6310 : {
6311 9 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6312 9 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6313 : mask, vec, identity);
6314 9 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6315 9 : return cond;
6316 : }
6317 :
6318 : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6319 : order, starting with LHS. Insert the extraction statements before GSI and
6320 : associate the new scalar SSA names with variable SCALAR_DEST.
6321 : If MASK is nonzero mask the input and then operate on it unconditionally.
6322 : Return the SSA name for the result. */
6323 :
6324 : static tree
6325 1105 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6326 : tree_code code, tree lhs, tree vector_rhs,
6327 : tree mask)
6328 : {
6329 1105 : tree vectype = TREE_TYPE (vector_rhs);
6330 1105 : tree scalar_type = TREE_TYPE (vectype);
6331 1105 : tree bitsize = TYPE_SIZE (scalar_type);
6332 1105 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6333 1105 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6334 :
6335 : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6336 : to perform an unconditional element-wise reduction of it. */
6337 1105 : if (mask)
6338 : {
6339 85 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6340 : "masked_vector_rhs");
6341 85 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6342 : false);
6343 85 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6344 85 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6345 : mask, vector_rhs, vector_identity);
6346 85 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6347 85 : vector_rhs = masked_vector_rhs;
6348 : }
6349 :
6350 1105 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6351 5181 : bit_offset < vec_size_in_bits;
6352 4076 : bit_offset += element_bitsize)
6353 : {
6354 4076 : tree bitpos = bitsize_int (bit_offset);
6355 4076 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6356 : bitsize, bitpos);
6357 :
6358 4076 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6359 4076 : rhs = make_ssa_name (scalar_dest, stmt);
6360 4076 : gimple_assign_set_lhs (stmt, rhs);
6361 4076 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6362 : /* Fold the vector extract, combining it with a previous reversal
6363 : like seen in PR90579. */
6364 4076 : auto gsi2 = gsi_for_stmt (stmt);
6365 4076 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6366 356 : update_stmt (gsi_stmt (gsi2));
6367 :
6368 4076 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6369 4076 : tree new_name = make_ssa_name (scalar_dest, stmt);
6370 4076 : gimple_assign_set_lhs (stmt, new_name);
6371 4076 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6372 4076 : lhs = new_name;
6373 : }
6374 1105 : return lhs;
6375 : }
6376 :
6377 : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6378 : type of the vector input. */
6379 :
6380 : static internal_fn
6381 2969 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6382 : {
6383 2969 : internal_fn mask_reduc_fn;
6384 2969 : internal_fn mask_len_reduc_fn;
6385 :
6386 2969 : switch (reduc_fn)
6387 : {
6388 0 : case IFN_FOLD_LEFT_PLUS:
6389 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6390 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6391 0 : break;
6392 :
6393 : default:
6394 : return IFN_LAST;
6395 : }
6396 :
6397 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6398 : OPTIMIZE_FOR_SPEED))
6399 : return mask_reduc_fn;
6400 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6401 : OPTIMIZE_FOR_SPEED))
6402 : return mask_len_reduc_fn;
6403 : return IFN_LAST;
6404 : }
6405 :
6406 : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6407 : statement that sets the live-out value. REDUC_DEF_STMT is the phi
6408 : statement. CODE is the operation performed by STMT_INFO and OPS are
6409 : its scalar operands. REDUC_INDEX is the index of the operand in
6410 : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6411 : implements in-order reduction, or IFN_LAST if we should open-code it.
6412 : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6413 : that should be used to control the operation in a fully-masked loop. */
6414 :
6415 : static bool
6416 847 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6417 : stmt_vec_info stmt_info,
6418 : gimple_stmt_iterator *gsi,
6419 : slp_tree slp_node,
6420 : code_helper code, internal_fn reduc_fn,
6421 : int num_ops, tree vectype_in,
6422 : int reduc_index, vec_loop_masks *masks,
6423 : vec_loop_lens *lens)
6424 : {
6425 847 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6426 847 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
6427 847 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6428 :
6429 847 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6430 :
6431 847 : bool is_cond_op = false;
6432 847 : if (!code.is_tree_code ())
6433 : {
6434 31 : code = conditional_internal_fn_code (internal_fn (code));
6435 31 : gcc_assert (code != ERROR_MARK);
6436 : is_cond_op = true;
6437 : }
6438 :
6439 847 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
6440 :
6441 847 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6442 : TYPE_VECTOR_SUBPARTS (vectype_in)));
6443 :
6444 : /* ??? We should, when transforming the cycle PHI, record the existing
6445 : scalar def as vector def so looking up the vector def works. This
6446 : would also allow generalizing this for reduction paths of length > 1
6447 : and/or SLP reductions. */
6448 847 : slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
6449 847 : stmt_vec_info reduc_var_def = SLP_TREE_SCALAR_STMTS (reduc_node)[0];
6450 847 : tree reduc_var = gimple_get_lhs (STMT_VINFO_STMT (reduc_var_def));
6451 :
6452 : /* The operands either come from a binary operation or an IFN_COND operation.
6453 : The former is a gimple assign with binary rhs and the latter is a
6454 : gimple call with four arguments. */
6455 847 : gcc_assert (num_ops == 2 || num_ops == 4);
6456 :
6457 847 : auto_vec<tree> vec_oprnds0, vec_opmask;
6458 847 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
6459 847 : + (1 - reduc_index)],
6460 : &vec_oprnds0);
6461 : /* For an IFN_COND_OP we also need the vector mask operand. */
6462 847 : if (is_cond_op)
6463 31 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
6464 :
6465 : /* The transform below relies on preserving the original scalar PHI
6466 : and its latch def which we replace. So work backwards from there. */
6467 847 : tree scalar_dest
6468 847 : = gimple_phi_arg_def_from_edge (as_a <gphi *> (STMT_VINFO_STMT
6469 : (reduc_var_def)),
6470 847 : loop_latch_edge (loop));
6471 847 : stmt_vec_info scalar_dest_def_info
6472 847 : = vect_stmt_to_vectorize (loop_vinfo->lookup_def (scalar_dest));
6473 847 : tree scalar_type = TREE_TYPE (scalar_dest);
6474 :
6475 847 : int vec_num = vec_oprnds0.length ();
6476 847 : tree vec_elem_type = TREE_TYPE (vectype_out);
6477 847 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6478 :
6479 847 : tree vector_identity = NULL_TREE;
6480 847 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6481 : {
6482 2 : vector_identity = build_zero_cst (vectype_out);
6483 2 : if (!HONOR_SIGNED_ZEROS (vectype_out))
6484 : ;
6485 : else
6486 : {
6487 2 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6488 2 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6489 : vector_identity);
6490 : }
6491 : }
6492 :
6493 847 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6494 847 : int i;
6495 847 : tree def0;
6496 1952 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6497 : {
6498 1105 : gimple *new_stmt;
6499 1105 : tree mask = NULL_TREE;
6500 1105 : tree len = NULL_TREE;
6501 1105 : tree bias = NULL_TREE;
6502 1105 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6503 : {
6504 9 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
6505 : vec_num, vectype_in, i);
6506 9 : if (is_cond_op)
6507 9 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
6508 9 : loop_mask, vec_opmask[i], gsi);
6509 : else
6510 : mask = loop_mask;
6511 : }
6512 1096 : else if (is_cond_op)
6513 76 : mask = vec_opmask[i];
6514 1105 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6515 : {
6516 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6517 : i, 1, false);
6518 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6519 0 : bias = build_int_cst (intQI_type_node, biasval);
6520 0 : if (!is_cond_op)
6521 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
6522 : }
6523 :
6524 : /* Handle MINUS by adding the negative. */
6525 1105 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6526 : {
6527 0 : tree negated = make_ssa_name (vectype_out);
6528 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6529 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6530 0 : def0 = negated;
6531 : }
6532 :
6533 9 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
6534 1114 : && mask && mask_reduc_fn == IFN_LAST)
6535 9 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6536 : vector_identity);
6537 :
6538 : /* On the first iteration the input is simply the scalar phi
6539 : result, and for subsequent iterations it is the output of
6540 : the preceding operation. */
6541 1105 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6542 : {
6543 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6544 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6545 : def0, mask, len, bias);
6546 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6547 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6548 : def0, mask);
6549 : else
6550 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6551 : def0);
6552 : /* For chained SLP reductions the output of the previous reduction
6553 : operation serves as the input of the next. For the final statement
6554 : the output cannot be a temporary - we reuse the original
6555 : scalar destination of the last statement. */
6556 0 : if (i != vec_num - 1)
6557 : {
6558 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
6559 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6560 0 : gimple_set_lhs (new_stmt, reduc_var);
6561 : }
6562 : }
6563 : else
6564 : {
6565 1105 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
6566 : tree_code (code), reduc_var, def0,
6567 : mask);
6568 1105 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6569 : /* Remove the statement, so that we can use the same code paths
6570 : as for statements that we've just created. */
6571 1105 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6572 1105 : gsi_remove (&tmp_gsi, true);
6573 : }
6574 :
6575 1105 : if (i == vec_num - 1)
6576 : {
6577 847 : gimple_set_lhs (new_stmt, scalar_dest);
6578 847 : vect_finish_replace_stmt (loop_vinfo,
6579 : scalar_dest_def_info,
6580 : new_stmt);
6581 : }
6582 : else
6583 258 : vect_finish_stmt_generation (loop_vinfo,
6584 : scalar_dest_def_info,
6585 : new_stmt, gsi);
6586 :
6587 1105 : slp_node->push_vec_def (new_stmt);
6588 : }
6589 :
6590 847 : return true;
6591 847 : }
6592 :
6593 : /* Function is_nonwrapping_integer_induction.
6594 :
6595 : Check if STMT_VINO (which is part of loop LOOP) both increments and
6596 : does not cause overflow. */
6597 :
6598 : static bool
6599 408 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6600 : {
6601 408 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6602 408 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6603 408 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6604 408 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6605 408 : widest_int ni, max_loop_value, lhs_max;
6606 408 : wi::overflow_type overflow = wi::OVF_NONE;
6607 :
6608 : /* Make sure the loop is integer based. */
6609 408 : if (TREE_CODE (base) != INTEGER_CST
6610 109 : || TREE_CODE (step) != INTEGER_CST)
6611 : return false;
6612 :
6613 : /* Check that the max size of the loop will not wrap. */
6614 :
6615 109 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6616 : return true;
6617 :
6618 8 : if (! max_stmt_executions (loop, &ni))
6619 : return false;
6620 :
6621 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6622 8 : &overflow);
6623 8 : if (overflow)
6624 : return false;
6625 :
6626 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6627 16 : TYPE_SIGN (lhs_type), &overflow);
6628 8 : if (overflow)
6629 : return false;
6630 :
6631 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6632 8 : <= TYPE_PRECISION (lhs_type));
6633 408 : }
6634 :
6635 : /* Check if masking can be supported by inserting a conditional expression.
6636 : CODE is the code for the operation. COND_FN is the conditional internal
6637 : function, if it exists. VECTYPE_IN is the type of the vector input. */
6638 : static bool
6639 5927 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6640 : tree vectype_in)
6641 : {
6642 5927 : if (cond_fn != IFN_LAST
6643 5927 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
6644 : OPTIMIZE_FOR_SPEED))
6645 : return false;
6646 :
6647 4185 : if (code.is_tree_code ())
6648 4171 : switch (tree_code (code))
6649 : {
6650 : case DOT_PROD_EXPR:
6651 : case SAD_EXPR:
6652 : return true;
6653 :
6654 : default:
6655 : break;
6656 : }
6657 : return false;
6658 : }
6659 :
6660 : /* Insert a conditional expression to enable masked vectorization. CODE is the
6661 : code for the operation. VOP is the array of operands. MASK is the loop
6662 : mask. GSI is a statement iterator used to place the new conditional
6663 : expression. */
6664 : static void
6665 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6666 : gimple_stmt_iterator *gsi)
6667 : {
6668 4 : switch (tree_code (code))
6669 : {
6670 4 : case DOT_PROD_EXPR:
6671 4 : {
6672 4 : tree vectype = TREE_TYPE (vop[1]);
6673 4 : tree zero = build_zero_cst (vectype);
6674 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6675 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6676 : mask, vop[1], zero);
6677 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6678 4 : vop[1] = masked_op1;
6679 4 : break;
6680 : }
6681 :
6682 0 : case SAD_EXPR:
6683 0 : {
6684 0 : tree vectype = TREE_TYPE (vop[1]);
6685 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6686 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6687 : mask, vop[1], vop[0]);
6688 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6689 0 : vop[1] = masked_op1;
6690 0 : break;
6691 : }
6692 :
6693 0 : default:
6694 0 : gcc_unreachable ();
6695 : }
6696 4 : }
6697 :
6698 : /* Given an operation with CODE in loop reduction path whose reduction PHI is
6699 : specified by REDUC_INFO, the operation has TYPE of scalar result, and its
6700 : input vectype is represented by VECTYPE_IN. The vectype of vectorized result
6701 : may be different from VECTYPE_IN, either in base type or vectype lanes,
6702 : lane-reducing operation is the case. This function check if it is possible,
6703 : and how to perform partial vectorization on the operation in the context
6704 : of LOOP_VINFO. */
6705 :
6706 : static void
6707 4133 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
6708 : vect_reduc_info reduc_info,
6709 : slp_tree slp_node,
6710 : code_helper code, tree type,
6711 : tree vectype_in)
6712 : {
6713 4133 : enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
6714 4133 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
6715 4133 : internal_fn cond_fn
6716 1164 : = ((code.is_internal_fn ()
6717 1164 : && internal_fn_mask_index ((internal_fn)code) != -1)
6718 4133 : ? (internal_fn)code : get_conditional_internal_fn (code, type));
6719 :
6720 4133 : if (reduc_type != FOLD_LEFT_REDUCTION
6721 3338 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6722 7358 : && (cond_fn == IFN_LAST
6723 3225 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6724 : OPTIMIZE_FOR_SPEED)))
6725 : {
6726 2011 : if (dump_enabled_p ())
6727 98 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6728 : "can't operate on partial vectors because"
6729 : " no conditional operation is available.\n");
6730 2011 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6731 : }
6732 2122 : else if (reduc_type == FOLD_LEFT_REDUCTION
6733 2122 : && reduc_fn == IFN_LAST
6734 2122 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
6735 : {
6736 0 : if (dump_enabled_p ())
6737 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6738 : "can't operate on partial vectors because"
6739 : " no conditional operation is available.\n");
6740 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6741 : }
6742 2122 : else if (reduc_type == FOLD_LEFT_REDUCTION
6743 795 : && internal_fn_mask_index (reduc_fn) == -1
6744 795 : && FLOAT_TYPE_P (vectype_in)
6745 2910 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
6746 : {
6747 0 : if (dump_enabled_p ())
6748 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6749 : "can't operate on partial vectors because"
6750 : " signed zeros cannot be preserved.\n");
6751 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6752 : }
6753 : else
6754 : {
6755 2122 : internal_fn mask_reduc_fn
6756 2122 : = get_masked_reduction_fn (reduc_fn, vectype_in);
6757 2122 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6758 2122 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
6759 2122 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);
6760 :
6761 2122 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6762 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
6763 : else
6764 2122 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
6765 : }
6766 4133 : }
6767 :
6768 : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
6769 : the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
6770 : and the analysis is for slp if SLP_NODE is not NULL.
6771 :
6772 : For a lane-reducing operation, the loop reduction path that it lies in,
6773 : may contain normal operation, or other lane-reducing operation of different
6774 : input type size, an example as:
6775 :
6776 : int sum = 0;
6777 : for (i)
6778 : {
6779 : ...
6780 : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
6781 : sum += w[i]; // widen-sum <vector(16) char>
6782 : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
6783 : sum += n[i]; // normal <vector(4) int>
6784 : ...
6785 : }
6786 :
6787 : Vectorization factor is essentially determined by operation whose input
6788 : vectype has the most lanes ("vector(16) char" in the example), while we
6789 : need to choose input vectype with the least lanes ("vector(4) int" in the
6790 : example) to determine effective number of vector reduction PHIs. */
6791 :
6792 : bool
6793 382321 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
6794 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
6795 : {
6796 382321 : gimple *stmt = stmt_info->stmt;
6797 :
6798 382321 : if (!lane_reducing_stmt_p (stmt))
6799 : return false;
6800 :
6801 714 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
6802 :
6803 714 : if (!INTEGRAL_TYPE_P (type))
6804 : return false;
6805 :
6806 : /* Do not try to vectorize bit-precision reductions. */
6807 714 : if (!type_has_mode_precision_p (type))
6808 : return false;
6809 :
6810 714 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6811 :
6812 : /* TODO: Support lane-reducing operation that does not directly participate
6813 : in loop reduction. */
6814 714 : if (!reduc_info)
6815 : return false;
6816 :
6817 : /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
6818 : recoginized. */
6819 714 : gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
6820 714 : gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);
6821 :
6822 2856 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
6823 : {
6824 2142 : slp_tree slp_op;
6825 2142 : tree op;
6826 2142 : tree vectype;
6827 2142 : enum vect_def_type dt;
6828 :
6829 2142 : if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
6830 : &slp_op, &dt, &vectype))
6831 : {
6832 0 : if (dump_enabled_p ())
6833 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6834 : "use not simple.\n");
6835 0 : return false;
6836 : }
6837 :
6838 2142 : if (!vectype)
6839 : {
6840 6 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
6841 : slp_op);
6842 6 : if (!vectype)
6843 : return false;
6844 : }
6845 :
6846 2142 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
6847 : {
6848 0 : if (dump_enabled_p ())
6849 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6850 : "incompatible vector types for invariants\n");
6851 0 : return false;
6852 : }
6853 :
6854 2142 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6855 714 : continue;
6856 :
6857 : /* There should be at most one cycle def in the stmt. */
6858 1428 : if (VECTORIZABLE_CYCLE_DEF (dt))
6859 : return false;
6860 : }
6861 :
6862 714 : slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
6863 714 : tree vectype_in = SLP_TREE_VECTYPE (node_in);
6864 714 : gcc_assert (vectype_in);
6865 :
6866 : /* Compute number of effective vector statements for costing. */
6867 714 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
6868 714 : gcc_assert (ncopies_for_cost >= 1);
6869 :
6870 714 : if (vect_is_emulated_mixed_dot_prod (slp_node))
6871 : {
6872 : /* We need extra two invariants: one that contains the minimum signed
6873 : value and one that contains half of its negative. */
6874 15 : int prologue_stmts = 2;
6875 15 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
6876 : scalar_to_vec, slp_node, 0,
6877 : vect_prologue);
6878 15 : if (dump_enabled_p ())
6879 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
6880 : "extra prologue_cost = %d .\n", cost);
6881 :
6882 : /* Three dot-products and a subtraction. */
6883 15 : ncopies_for_cost *= 4;
6884 : }
6885 :
6886 714 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
6887 : 0, vect_body);
6888 :
6889 714 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
6890 : {
6891 113 : enum tree_code code = gimple_assign_rhs_code (stmt);
6892 113 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
6893 113 : node_in, code, type,
6894 : vectype_in);
6895 : }
6896 :
6897 : /* Transform via vect_transform_reduction. */
6898 714 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6899 714 : return true;
6900 : }
6901 :
6902 : /* Function vectorizable_reduction.
6903 :
6904 : Check if STMT_INFO performs a reduction operation that can be vectorized.
6905 : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6906 : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6907 : Return true if STMT_INFO is vectorizable in this way.
6908 :
6909 : This function also handles reduction idioms (patterns) that have been
6910 : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6911 : may be of this form:
6912 : X = pattern_expr (arg0, arg1, ..., X)
6913 : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6914 : sequence that had been detected and replaced by the pattern-stmt
6915 : (STMT_INFO).
6916 :
6917 : This function also handles reduction of condition expressions, for example:
6918 : for (int i = 0; i < N; i++)
6919 : if (a[i] < value)
6920 : last = a[i];
6921 : This is handled by vectorising the loop and creating an additional vector
6922 : containing the loop indexes for which "a[i] < value" was true. In the
6923 : function epilogue this is reduced to a single max value and then used to
6924 : index into the vector of results.
6925 :
6926 : In some cases of reduction patterns, the type of the reduction variable X is
6927 : different than the type of the other arguments of STMT_INFO.
6928 : In such cases, the vectype that is used when transforming STMT_INFO into
6929 : a vector stmt is different than the vectype that is used to determine the
6930 : vectorization factor, because it consists of a different number of elements
6931 : than the actual number of elements that are being operated upon in parallel.
6932 :
6933 : For example, consider an accumulation of shorts into an int accumulator.
6934 : On some targets it's possible to vectorize this pattern operating on 8
6935 : shorts at a time (hence, the vectype for purposes of determining the
6936 : vectorization factor should be V8HI); on the other hand, the vectype that
6937 : is used to create the vector form is actually V4SI (the type of the result).
6938 :
6939 : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6940 : indicates what is the actual level of parallelism (V8HI in the example), so
6941 : that the right vectorization factor would be derived. This vectype
6942 : corresponds to the type of arguments to the reduction stmt, and should *NOT*
6943 : be used to create the vectorized stmt. The right vectype for the vectorized
6944 : stmt is obtained from the type of the result X:
6945 : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6946 :
6947 : This means that, contrary to "regular" reductions (or "regular" stmts in
6948 : general), the following equation:
6949 : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6950 : does *NOT* necessarily hold for reduction patterns. */
6951 :
6952 : bool
6953 381607 : vectorizable_reduction (loop_vec_info loop_vinfo,
6954 : stmt_vec_info stmt_info, slp_tree slp_node,
6955 : slp_instance slp_node_instance,
6956 : stmt_vector_for_cost *cost_vec)
6957 : {
6958 381607 : tree vectype_in = NULL_TREE;
6959 381607 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6960 381607 : stmt_vec_info cond_stmt_vinfo = NULL;
6961 381607 : int i;
6962 381607 : int ncopies;
6963 381607 : bool single_defuse_cycle = false;
6964 381607 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6965 381607 : tree cond_reduc_val = NULL_TREE;
6966 :
6967 : /* Make sure it was already recognized as a reduction computation. */
6968 381607 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6969 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6970 381607 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6971 : return false;
6972 :
6973 : /* The reduction meta. */
6974 84218 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6975 :
6976 84218 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6977 : {
6978 1490 : gcc_assert (is_a <gphi *> (stmt_info->stmt));
6979 : /* We eventually need to set a vector type on invariant arguments. */
6980 : unsigned j;
6981 : slp_tree child;
6982 4462 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6983 2980 : if (!vect_maybe_update_slp_op_vectype (child,
6984 : SLP_TREE_VECTYPE (slp_node)))
6985 : {
6986 0 : if (dump_enabled_p ())
6987 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6988 : "incompatible vector types for "
6989 : "invariants\n");
6990 0 : return false;
6991 : }
6992 2980 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
6993 2980 : && !useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
6994 : SLP_TREE_VECTYPE (child)))
6995 : {
6996 : /* With bools we can have mask and non-mask precision vectors
6997 : or different non-mask precisions. while pattern recog is
6998 : supposed to guarantee consistency here, we do not have
6999 : pattern stmts for PHIs (PR123316).
7000 : Deal with that here instead of ICEing later. */
7001 8 : if (dump_enabled_p ())
7002 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7003 : "incompatible vector type setup from "
7004 : "bool pattern detection\n");
7005 8 : return false;
7006 : }
7007 : /* Analysis for double-reduction is done on the outer
7008 : loop PHI, nested cycles have no further restrictions. */
7009 1482 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7010 1482 : return true;
7011 : }
7012 :
7013 82728 : if (!is_a <gphi *> (stmt_info->stmt))
7014 : {
7015 7958 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
7016 7958 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
7017 7958 : return true;
7018 : }
7019 :
7020 74770 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7021 74770 : stmt_vec_info phi_info = stmt_info;
7022 74770 : bool double_reduc = false;
7023 74770 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7024 : {
7025 : /* We arrive here for both the inner loop LC PHI and the
7026 : outer loop PHI. The latter is what we want to analyze the
7027 : reduction with. The LC PHI is handled by vectorizable_lc_phi. */
7028 312 : if (gimple_bb (stmt_info->stmt) != loop->header)
7029 0 : return false;
7030 :
7031 : /* Set loop and phi_info to the inner loop. */
7032 312 : use_operand_p use_p;
7033 312 : gimple *use_stmt;
7034 312 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7035 : &use_p, &use_stmt);
7036 312 : gcc_assert (res);
7037 312 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
7038 312 : loop = loop->inner;
7039 312 : double_reduc = true;
7040 : }
7041 :
7042 74770 : const bool reduc_chain = reduc_info->is_reduc_chain;
7043 74770 : slp_node_instance->reduc_phis = slp_node;
7044 : /* ??? We're leaving slp_node to point to the PHIs, we only
7045 : need it to get at the number of vector stmts which wasn't
7046 : yet initialized for the instance root. */
7047 :
7048 : /* PHIs should not participate in patterns. */
7049 74770 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7050 74770 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7051 :
7052 : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7053 : and compute the reduction chain length. Discover the real
7054 : reduction operation stmt on the way (slp_for_stmt_info). */
7055 74770 : unsigned reduc_chain_length = 0;
7056 74770 : stmt_info = NULL;
7057 74770 : slp_tree slp_for_stmt_info = NULL;
7058 74770 : slp_tree vdef_slp = slp_node_instance->root;
7059 165076 : while (vdef_slp != slp_node)
7060 : {
7061 91348 : int reduc_idx = SLP_TREE_REDUC_IDX (vdef_slp);
7062 91348 : if (reduc_idx == -1)
7063 : {
7064 1034 : if (dump_enabled_p ())
7065 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7066 : "reduction chain broken by patterns.\n");
7067 1042 : return false;
7068 : }
7069 90314 : stmt_vec_info vdef = SLP_TREE_REPRESENTATIVE (vdef_slp);
7070 90314 : if (is_a <gphi *> (vdef->stmt))
7071 : {
7072 624 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7073 : /* Do not count PHIs towards the chain length. */
7074 624 : continue;
7075 : }
7076 89690 : gimple_match_op op;
7077 89690 : if (!gimple_extract_op (vdef->stmt, &op))
7078 : {
7079 0 : if (dump_enabled_p ())
7080 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7081 : "reduction chain includes unsupported"
7082 : " statement type.\n");
7083 0 : return false;
7084 : }
7085 89690 : if (CONVERT_EXPR_CODE_P (op.code))
7086 : {
7087 5234 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7088 : {
7089 8 : if (dump_enabled_p ())
7090 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7091 : "conversion in the reduction chain.\n");
7092 8 : return false;
7093 : }
7094 5226 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
7095 : }
7096 : else
7097 : {
7098 : /* First non-conversion stmt. */
7099 84456 : if (!slp_for_stmt_info)
7100 73728 : slp_for_stmt_info = vdef_slp;
7101 :
7102 84456 : if (lane_reducing_op_p (op.code))
7103 : {
7104 : /* The last operand of lane-reducing operation is for
7105 : reduction. */
7106 714 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7107 :
7108 714 : slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
7109 714 : tree vectype_op = SLP_TREE_VECTYPE (op_node);
7110 714 : tree type_op = TREE_TYPE (op.ops[0]);
7111 714 : if (!vectype_op)
7112 : {
7113 9 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7114 : type_op);
7115 9 : if (!vectype_op
7116 9 : || !vect_maybe_update_slp_op_vectype (op_node,
7117 : vectype_op))
7118 0 : return false;
7119 : }
7120 :
7121 : /* To accommodate lane-reducing operations of mixed input
7122 : vectypes, choose input vectype with the least lanes for the
7123 : reduction PHI statement, which would result in the most
7124 : ncopies for vectorized reduction results. */
7125 714 : if (!vectype_in
7126 714 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7127 58 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7128 685 : vectype_in = vectype_op;
7129 : }
7130 83742 : else if (!vectype_in)
7131 73043 : vectype_in = SLP_TREE_VECTYPE (slp_node);
7132 84456 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7133 : }
7134 89682 : reduc_chain_length++;
7135 : }
7136 73728 : if (!slp_for_stmt_info)
7137 : {
7138 0 : if (dump_enabled_p ())
7139 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7140 : "only noop-conversions in the reduction chain.\n");
7141 0 : return false;
7142 : }
7143 73728 : stmt_info = SLP_TREE_REPRESENTATIVE (slp_for_stmt_info);
7144 :
7145 : /* PHIs should not participate in patterns. */
7146 73728 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7147 :
7148 : /* 1. Is vectorizable reduction? */
7149 : /* Not supportable if the reduction variable is used in the loop, unless
7150 : it's a reduction chain. */
7151 73728 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7152 0 : && !reduc_chain)
7153 : return false;
7154 :
7155 : /* Reductions that are not used even in an enclosing outer-loop,
7156 : are expected to be "live" (used out of the loop). */
7157 73728 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7158 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7159 : return false;
7160 :
7161 : /* 2. Has this been recognized as a reduction pattern?
7162 :
7163 : Check if STMT represents a pattern that has been recognized
7164 : in earlier analysis stages. For stmts that represent a pattern,
7165 : the STMT_VINFO_RELATED_STMT field records the last stmt in
7166 : the original sequence that constitutes the pattern. */
7167 :
7168 73728 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7169 73728 : if (orig_stmt_info)
7170 : {
7171 5111 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7172 5111 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7173 : }
7174 :
7175 : /* 3. Check the operands of the operation. The first operands are defined
7176 : inside the loop body. The last operand is the reduction variable,
7177 : which is defined by the loop-header-phi. */
7178 :
7179 73728 : tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
7180 73728 : VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
7181 :
7182 73728 : gimple_match_op op;
7183 73728 : if (!gimple_extract_op (stmt_info->stmt, &op))
7184 0 : gcc_unreachable ();
7185 73728 : bool lane_reducing = lane_reducing_op_p (op.code);
7186 :
7187 73728 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7188 21995 : && !SCALAR_FLOAT_TYPE_P (op.type))
7189 : return false;
7190 :
7191 : /* Do not try to vectorize bit-precision reductions. */
7192 73728 : if (!type_has_mode_precision_p (op.type)
7193 1717 : && op.code != BIT_AND_EXPR
7194 1582 : && op.code != BIT_IOR_EXPR
7195 74204 : && op.code != BIT_XOR_EXPR)
7196 : return false;
7197 :
7198 : /* Lane-reducing ops can also never be used in an SLP reduction group
7199 : since we'll mix lanes belonging to different reductions. But it's
7200 : OK to use them in a reduction chain or when the reduction group
7201 : has just one element. */
7202 73418 : if (lane_reducing
7203 73418 : && !reduc_chain
7204 650 : && SLP_TREE_LANES (slp_node) > 1)
7205 : {
7206 0 : if (dump_enabled_p ())
7207 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7208 : "lane-reducing reduction in reduction group.\n");
7209 0 : return false;
7210 : }
7211 :
7212 : /* All uses but the last are expected to be defined in the loop.
7213 : The last use is the reduction variable. In case of nested cycle this
7214 : assumption is not true: we use reduc_index to record the index of the
7215 : reduction variable. */
7216 73418 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7217 73418 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7218 73418 : gcc_assert (op.code != COND_EXPR || !COMPARISON_CLASS_P (op.ops[0]));
7219 235330 : for (i = 0; i < (int) op.num_ops; i++)
7220 : {
7221 : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7222 161912 : if (i == 0 && op.code == COND_EXPR)
7223 81028 : continue;
7224 :
7225 161078 : stmt_vec_info def_stmt_info;
7226 161078 : enum vect_def_type dt;
7227 161078 : if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
7228 : i, &op.ops[i], &slp_op[i], &dt,
7229 161078 : &vectype_op[i], &def_stmt_info))
7230 : {
7231 0 : if (dump_enabled_p ())
7232 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7233 : "use not simple.\n");
7234 0 : return false;
7235 : }
7236 :
7237 : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7238 : reduction operand twice (once as definition, once as else). */
7239 161078 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]
7240 322156 : == SLP_TREE_CHILDREN
7241 161078 : (slp_for_stmt_info)[SLP_TREE_REDUC_IDX (slp_for_stmt_info)])
7242 80194 : continue;
7243 :
7244 : /* There should be only one cycle def in the stmt, the one
7245 : leading to reduc_def. */
7246 80884 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]->cycle_info.id != -1)
7247 : return false;
7248 :
7249 80884 : if (!vectype_op[i])
7250 7382 : vectype_op[i]
7251 7382 : = get_vectype_for_scalar_type (loop_vinfo,
7252 7382 : TREE_TYPE (op.ops[i]), slp_op[i]);
7253 :
7254 : /* Record how the non-reduction-def value of COND_EXPR is defined.
7255 : ??? For a chain of multiple CONDs we'd have to match them up all. */
7256 80884 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7257 : {
7258 811 : if (dt == vect_constant_def)
7259 : {
7260 98 : cond_reduc_dt = dt;
7261 98 : cond_reduc_val = op.ops[i];
7262 : }
7263 713 : else if (dt == vect_induction_def
7264 408 : && def_stmt_info
7265 1121 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7266 : {
7267 109 : cond_reduc_dt = dt;
7268 109 : cond_stmt_vinfo = def_stmt_info;
7269 : }
7270 : }
7271 : }
7272 :
7273 73418 : enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7274 : /* If we have a condition reduction, see if we can simplify it further. */
7275 73418 : if (reduction_type == COND_REDUCTION)
7276 : {
7277 822 : if (SLP_TREE_LANES (slp_node) != 1)
7278 : return false;
7279 :
7280 : /* When the condition uses the reduction value in the condition, fail. */
7281 798 : if (SLP_TREE_REDUC_IDX (slp_node) == 0)
7282 : {
7283 0 : if (dump_enabled_p ())
7284 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7285 : "condition depends on previous iteration\n");
7286 0 : return false;
7287 : }
7288 :
7289 798 : if (reduc_chain_length == 1
7290 798 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7291 : OPTIMIZE_FOR_SPEED)
7292 775 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7293 : vectype_in,
7294 : OPTIMIZE_FOR_SPEED)))
7295 : {
7296 0 : if (dump_enabled_p ())
7297 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7298 : "optimizing condition reduction with"
7299 : " FOLD_EXTRACT_LAST.\n");
7300 0 : VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7301 : }
7302 798 : else if (cond_reduc_dt == vect_induction_def)
7303 : {
7304 109 : tree base
7305 : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7306 109 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7307 :
7308 109 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7309 : && TREE_CODE (step) == INTEGER_CST);
7310 109 : cond_reduc_val = NULL_TREE;
7311 109 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7312 109 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7313 109 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7314 : ;
7315 : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7316 : above base; punt if base is the minimum value of the type for
7317 : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7318 97 : else if (tree_int_cst_sgn (step) == -1)
7319 : {
7320 18 : cond_reduc_op_code = MIN_EXPR;
7321 18 : if (tree_int_cst_sgn (base) == -1)
7322 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7323 18 : else if (tree_int_cst_lt (base,
7324 18 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7325 18 : cond_reduc_val
7326 18 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7327 : }
7328 : else
7329 : {
7330 79 : cond_reduc_op_code = MAX_EXPR;
7331 79 : if (tree_int_cst_sgn (base) == 1)
7332 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7333 79 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7334 : base))
7335 79 : cond_reduc_val
7336 79 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7337 : }
7338 97 : if (cond_reduc_val)
7339 : {
7340 97 : if (dump_enabled_p ())
7341 61 : dump_printf_loc (MSG_NOTE, vect_location,
7342 : "condition expression based on "
7343 : "integer induction.\n");
7344 97 : VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
7345 97 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
7346 97 : = cond_reduc_val;
7347 97 : VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7348 : }
7349 : }
7350 689 : else if (cond_reduc_dt == vect_constant_def)
7351 : {
7352 88 : enum vect_def_type cond_initial_dt;
7353 88 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7354 88 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7355 88 : if (cond_initial_dt == vect_constant_def
7356 113 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7357 25 : TREE_TYPE (cond_reduc_val)))
7358 : {
7359 25 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7360 : cond_initial_val, cond_reduc_val);
7361 25 : if (e && (integer_onep (e) || integer_zerop (e)))
7362 : {
7363 25 : if (dump_enabled_p ())
7364 16 : dump_printf_loc (MSG_NOTE, vect_location,
7365 : "condition expression based on "
7366 : "compile time constant.\n");
7367 : /* Record reduction code at analysis stage. */
7368 25 : VECT_REDUC_INFO_CODE (reduc_info)
7369 25 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7370 25 : VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
7371 : }
7372 : }
7373 : }
7374 : }
7375 :
7376 73394 : if (STMT_VINFO_LIVE_P (phi_info))
7377 : return false;
7378 :
7379 73394 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
7380 :
7381 73394 : gcc_assert (ncopies >= 1);
7382 :
7383 73394 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7384 :
7385 : /* 4.2. Check support for the epilog operation.
7386 :
7387 : If STMT represents a reduction pattern, then the type of the
7388 : reduction variable may be different than the type of the rest
7389 : of the arguments. For example, consider the case of accumulation
7390 : of shorts into an int accumulator; The original code:
7391 : S1: int_a = (int) short_a;
7392 : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7393 :
7394 : was replaced with:
7395 : STMT: int_acc = widen_sum <short_a, int_acc>
7396 :
7397 : This means that:
7398 : 1. The tree-code that is used to create the vector operation in the
7399 : epilog code (that reduces the partial results) is not the
7400 : tree-code of STMT, but is rather the tree-code of the original
7401 : stmt from the pattern that STMT is replacing. I.e, in the example
7402 : above we want to use 'widen_sum' in the loop, but 'plus' in the
7403 : epilog.
7404 : 2. The type (mode) we use to check available target support
7405 : for the vector operation to be created in the *epilog*, is
7406 : determined by the type of the reduction variable (in the example
7407 : above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7408 : However the type (mode) we use to check available target support
7409 : for the vector operation to be created *inside the loop*, is
7410 : determined by the type of the other arguments to STMT (in the
7411 : example we'd check this: optab_handler (widen_sum_optab,
7412 : vect_short_mode)).
7413 :
7414 : This is contrary to "regular" reductions, in which the types of all
7415 : the arguments are the same as the type of the reduction variable.
7416 : For "regular" reductions we can therefore use the same vector type
7417 : (and also the same tree-code) when generating the epilog code and
7418 : when generating the code inside the loop. */
7419 :
7420 73394 : code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
7421 :
7422 : /* If-conversion might already have created a conditional operation like
7423 : IFN_COND_ADD. If so, use its underlying tree code for the checks below. */
7424 73394 : if (orig_code.is_internal_fn ())
7425 : {
7426 6844 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7427 6844 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7428 : }
7429 :
7430 73394 : VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
7431 :
7432 73394 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7433 73394 : if (reduction_type == TREE_CODE_REDUCTION)
7434 : {
7435 : /* Check whether it's ok to change the order of the computation.
7436 : Generally, when vectorizing a reduction we change the order of the
7437 : computation. This may change the behavior of the program in some
7438 : cases, so we need to check that this is ok. One exception is when
7439 : vectorizing an outer-loop: the inner-loop is executed sequentially,
7440 : and therefore vectorizing reductions in the inner-loop during
7441 : outer-loop vectorization is safe. Likewise when we are vectorizing
7442 : a series of reductions using SLP and the VF is one the reductions
7443 : are performed in scalar order. */
7444 72596 : if (!reduc_chain
7445 72596 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7446 : ;
7447 72439 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7448 : {
7449 : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7450 : is not directly used in stmt. */
7451 5183 : if (reduc_chain_length != 1)
7452 : {
7453 67 : if (dump_enabled_p ())
7454 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7455 : "in-order reduction chain without SLP.\n");
7456 67 : return false;
7457 : }
7458 : /* Code generation doesn't support function calls other
7459 : than .COND_*. */
7460 5116 : if (!op.code.is_tree_code ()
7461 5338 : && !(op.code.is_internal_fn ()
7462 111 : && conditional_internal_fn_code (internal_fn (op.code))
7463 : != ERROR_MARK))
7464 : {
7465 18 : if (dump_enabled_p ())
7466 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7467 : "in-order reduction chain operation not "
7468 : "supported.\n");
7469 18 : return false;
7470 : }
7471 5098 : VECT_REDUC_INFO_TYPE (reduc_info)
7472 5098 : = reduction_type = FOLD_LEFT_REDUCTION;
7473 : }
7474 67256 : else if (!commutative_binary_op_p (orig_code, op.type)
7475 67256 : || !associative_binary_op_p (orig_code, op.type))
7476 : {
7477 144 : if (dump_enabled_p ())
7478 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7479 : "reduction: not commutative/associative\n");
7480 144 : return false;
7481 : }
7482 : }
7483 :
7484 5098 : if ((reduction_type == COND_REDUCTION
7485 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7486 : || reduction_type == CONST_COND_REDUCTION
7487 68067 : || reduction_type == EXTRACT_LAST_REDUCTION)
7488 798 : && ncopies > 1)
7489 : {
7490 276 : if (dump_enabled_p ())
7491 60 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7492 : "multiple types in condition reduction.\n");
7493 276 : return false;
7494 : }
7495 :
7496 : /* See if we can convert a mask vector to a corresponding bool data vector
7497 : to perform the epilogue reduction. */
7498 72889 : tree alt_vectype_out = NULL_TREE;
7499 72889 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
7500 : {
7501 1121 : alt_vectype_out
7502 2242 : = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
7503 1121 : TREE_TYPE (vectype_out),
7504 : TYPE_VECTOR_SUBPARTS
7505 : (vectype_out));
7506 1121 : if (!alt_vectype_out
7507 1121 : || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
7508 2220 : TYPE_VECTOR_SUBPARTS (vectype_out))
7509 2242 : || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
7510 22 : alt_vectype_out = NULL_TREE;
7511 : }
7512 :
7513 72889 : internal_fn reduc_fn = IFN_LAST;
7514 72889 : if (reduction_type == TREE_CODE_REDUCTION
7515 72889 : || reduction_type == FOLD_LEFT_REDUCTION
7516 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7517 522 : || reduction_type == CONST_COND_REDUCTION)
7518 : {
7519 67383 : if (reduction_type == FOLD_LEFT_REDUCTION
7520 76793 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7521 67383 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7522 : {
7523 71809 : internal_fn sbool_fn = IFN_LAST;
7524 71809 : if (reduc_fn == IFN_LAST)
7525 : ;
7526 69837 : else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
7527 1121 : || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
7528 : == MODE_VECTOR_BOOL))
7529 138553 : && direct_internal_fn_supported_p (reduc_fn, vectype_out,
7530 : OPTIMIZE_FOR_SPEED))
7531 : ;
7532 18325 : else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
7533 1121 : && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
7534 19446 : && direct_internal_fn_supported_p (sbool_fn, vectype_out,
7535 : OPTIMIZE_FOR_SPEED))
7536 110 : reduc_fn = sbool_fn;
7537 18215 : else if (reduction_type != FOLD_LEFT_REDUCTION
7538 18215 : && alt_vectype_out
7539 18215 : && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
7540 : OPTIMIZE_FOR_SPEED))
7541 790 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7542 : else
7543 : {
7544 17425 : if (dump_enabled_p ())
7545 930 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7546 : "reduc op not supported by target.\n");
7547 :
7548 17425 : reduc_fn = IFN_LAST;
7549 : }
7550 : }
7551 : else
7552 : {
7553 672 : if (dump_enabled_p ())
7554 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7555 : "no reduc code for scalar code.\n");
7556 :
7557 672 : return false;
7558 : }
7559 71809 : if (reduc_fn == IFN_LAST
7560 71809 : && VECTOR_BOOLEAN_TYPE_P (vectype_out))
7561 : {
7562 221 : if (!alt_vectype_out)
7563 : {
7564 12 : if (dump_enabled_p ())
7565 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7566 : "cannot turn mask into bool data vector for "
7567 : "reduction epilogue.\n");
7568 12 : return false;
7569 : }
7570 209 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7571 : }
7572 : }
7573 408 : else if (reduction_type == COND_REDUCTION)
7574 : {
7575 408 : int scalar_precision
7576 408 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7577 408 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
7578 408 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7579 : vectype_out);
7580 :
7581 408 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7582 : OPTIMIZE_FOR_SPEED))
7583 12 : reduc_fn = IFN_REDUC_MAX;
7584 : }
7585 72205 : VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
7586 :
7587 72205 : if (reduction_type != EXTRACT_LAST_REDUCTION
7588 : && reduc_fn == IFN_LAST
7589 : && !nunits_out.is_constant ())
7590 : {
7591 : if (dump_enabled_p ())
7592 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7593 : "missing target support for reduction on"
7594 : " variable-length vectors.\n");
7595 : return false;
7596 : }
7597 :
7598 : /* For SLP reductions, see if there is a neutral value we can use. */
7599 72205 : tree neutral_op = NULL_TREE;
7600 72205 : tree initial_value = NULL_TREE;
7601 72205 : if (reduc_chain)
7602 2216 : initial_value = vect_phi_initial_value (reduc_def_phi);
7603 72205 : neutral_op = neutral_op_for_reduction (TREE_TYPE
7604 : (gimple_phi_result (reduc_def_phi)),
7605 : orig_code, initial_value);
7606 72205 : VECT_REDUC_INFO_NEUTRAL_OP (reduc_info) = neutral_op;
7607 :
7608 72205 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7609 : {
7610 : /* We can't support in-order reductions of code such as this:
7611 :
7612 : for (int i = 0; i < n1; ++i)
7613 : for (int j = 0; j < n2; ++j)
7614 : l += a[j];
7615 :
7616 : since GCC effectively transforms the loop when vectorizing:
7617 :
7618 : for (int i = 0; i < n1 / VF; ++i)
7619 : for (int j = 0; j < n2; ++j)
7620 : for (int k = 0; k < VF; ++k)
7621 : l += a[j];
7622 :
7623 : which is a reassociation of the original operation. */
7624 56 : if (dump_enabled_p ())
7625 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7626 : "in-order double reduction not supported.\n");
7627 :
7628 56 : return false;
7629 : }
7630 :
7631 72149 : if (reduction_type == FOLD_LEFT_REDUCTION
7632 4370 : && SLP_TREE_LANES (slp_node) > 1
7633 159 : && !reduc_chain)
7634 : {
7635 : /* We cannot use in-order reductions in this case because there is
7636 : an implicit reassociation of the operations involved. */
7637 64 : if (dump_enabled_p ())
7638 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7639 : "in-order unchained SLP reductions not supported.\n");
7640 64 : return false;
7641 : }
7642 :
7643 : /* For double reductions, and for SLP reductions with a neutral value,
7644 : we construct a variable-length initial vector by loading a vector
7645 : full of the neutral value and then shift-and-inserting the start
7646 : values into the low-numbered elements. This is however not needed
7647 : when neutral and initial value are equal or we can handle the
7648 : initial value via adjustment in the epilogue. */
7649 72085 : if ((double_reduc || neutral_op)
7650 : && !nunits_out.is_constant ()
7651 : && reduction_type != INTEGER_INDUC_COND_REDUCTION
7652 : && !((SLP_TREE_LANES (slp_node) == 1 || reduc_chain)
7653 : && neutral_op
7654 : && (!double_reduc
7655 : || operand_equal_p (neutral_op,
7656 : vect_phi_initial_value (reduc_def_phi))))
7657 : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7658 : vectype_out, OPTIMIZE_FOR_BOTH))
7659 : {
7660 : if (dump_enabled_p ())
7661 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7662 : "reduction on variable-length vectors requires"
7663 : " target support for a vector-shift-and-insert"
7664 : " operation.\n");
7665 : return false;
7666 : }
7667 :
7668 : /* Check extra constraints for variable-length unchained SLP reductions. */
7669 72085 : if (!reduc_chain
7670 : && !nunits_out.is_constant ())
7671 : {
7672 : /* We checked above that we could build the initial vector when
7673 : there's a neutral element value. Check here for the case in
7674 : which each SLP statement has its own initial value and in which
7675 : that value needs to be repeated for every instance of the
7676 : statement within the initial vector. */
7677 : unsigned int group_size = SLP_TREE_LANES (slp_node);
7678 : if (!neutral_op
7679 : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7680 : TREE_TYPE (vectype_out)))
7681 : {
7682 : if (dump_enabled_p ())
7683 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7684 : "unsupported form of SLP reduction for"
7685 : " variable-length vectors: cannot build"
7686 : " initial vector.\n");
7687 : return false;
7688 : }
7689 : /* The epilogue code relies on the number of elements being a multiple
7690 : of the group size. The duplicate-and-interleave approach to setting
7691 : up the initial vector does too. */
7692 : if (!multiple_p (nunits_out, group_size))
7693 : {
7694 : if (dump_enabled_p ())
7695 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7696 : "unsupported form of SLP reduction for"
7697 : " variable-length vectors: the vector size"
7698 : " is not a multiple of the number of results.\n");
7699 : return false;
7700 : }
7701 : }
7702 :
7703 72085 : if (reduction_type == COND_REDUCTION)
7704 : {
7705 408 : widest_int ni;
7706 :
7707 408 : if (! max_loop_iterations (loop, &ni))
7708 : {
7709 14 : if (dump_enabled_p ())
7710 0 : dump_printf_loc (MSG_NOTE, vect_location,
7711 : "loop count not known, cannot create cond "
7712 : "reduction.\n");
7713 14 : return false;
7714 : }
7715 : /* Convert backedges to iterations. */
7716 394 : ni += 1;
7717 :
7718 : /* The additional index will be the same type as the condition. Check
7719 : that the loop can fit into this less one (because we'll use up the
7720 : zero slot for when there are no matches). */
7721 394 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7722 394 : if (wi::geu_p (ni, wi::to_widest (max_index)))
7723 : {
7724 90 : if (dump_enabled_p ())
7725 54 : dump_printf_loc (MSG_NOTE, vect_location,
7726 : "loop size is greater than data size.\n");
7727 90 : return false;
7728 : }
7729 408 : }
7730 :
7731 : /* In case the vectorization factor (VF) is bigger than the number
7732 : of elements that we can fit in a vectype (nunits), we have to generate
7733 : more than one vector stmt - i.e. we need to "unroll" the
7734 : vector stmt by a factor VF/nunits. For more details see documentation
7735 : in vectorizable_operation. */
7736 :
7737 : /* If the reduction is used in an outer loop we need to generate
7738 : VF intermediate results, like so (e.g. for ncopies=2):
7739 : r0 = phi (init, r0)
7740 : r1 = phi (init, r1)
7741 : r0 = x0 + r0;
7742 : r1 = x1 + r1;
7743 : (i.e. we generate VF results in 2 registers).
7744 : In this case we have a separate def-use cycle for each copy, and therefore
7745 : for each copy we get the vector def for the reduction variable from the
7746 : respective phi node created for this copy.
7747 :
7748 : Otherwise (the reduction is unused in the loop nest), we can combine
7749 : together intermediate results, like so (e.g. for ncopies=2):
7750 : r = phi (init, r)
7751 : r = x0 + r;
7752 : r = x1 + r;
7753 : (i.e. we generate VF/2 results in a single register).
7754 : In this case for each copy we get the vector def for the reduction variable
7755 : from the vectorized reduction operation generated in the previous iteration.
7756 :
7757 : This only works when we see both the reduction PHI and its only consumer
7758 : in vectorizable_reduction and there are no intermediate stmts
7759 : participating. When unrolling we want each unrolled iteration to have its
7760 : own reduction accumulator since one of the main goals of unrolling a
7761 : reduction is to reduce the aggregate loop-carried latency. */
7762 71981 : if (ncopies > 1
7763 71981 : && !reduc_chain
7764 8001 : && SLP_TREE_LANES (slp_node) == 1
7765 7830 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7766 7807 : && reduc_chain_length == 1
7767 7420 : && loop_vinfo->suggested_unroll_factor == 1)
7768 71981 : single_defuse_cycle = true;
7769 :
7770 71981 : if (single_defuse_cycle && !lane_reducing)
7771 : {
7772 6475 : gcc_assert (op.code != COND_EXPR);
7773 :
7774 : /* 4. check support for the operation in the loop
7775 :
7776 : This isn't necessary for the lane reduction codes, since they
7777 : can only be produced by pattern matching, and it's up to the
7778 : pattern matcher to test for support. The main reason for
7779 : specifically skipping this step is to avoid rechecking whether
7780 : mixed-sign dot-products can be implemented using signed
7781 : dot-products. */
7782 6475 : machine_mode vec_mode = TYPE_MODE (vectype_in);
7783 6475 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
7784 : {
7785 2065 : if (dump_enabled_p ())
7786 44 : dump_printf (MSG_NOTE, "op not supported by target.\n");
7787 4130 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7788 2065 : || !vect_can_vectorize_without_simd_p (op.code))
7789 : single_defuse_cycle = false;
7790 : else
7791 5 : if (dump_enabled_p ())
7792 0 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7793 : }
7794 :
7795 6475 : if (vect_emulated_vector_p (vectype_in)
7796 6475 : && !vect_can_vectorize_without_simd_p (op.code))
7797 : {
7798 0 : if (dump_enabled_p ())
7799 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
7800 0 : return false;
7801 : }
7802 : }
7803 71981 : if (dump_enabled_p () && single_defuse_cycle)
7804 695 : dump_printf_loc (MSG_NOTE, vect_location,
7805 : "using single def-use cycle for reduction by reducing "
7806 : "multiple vectors to one in the loop body\n");
7807 71981 : VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7808 :
7809 : /* For lane-reducing operation, the below processing related to single
7810 : defuse-cycle will be done in its own vectorizable function. One more
7811 : thing to note is that the operation must not be involved in fold-left
7812 : reduction. */
7813 71981 : single_defuse_cycle &= !lane_reducing;
7814 :
7815 71981 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
7816 28376 : for (i = 0; i < (int) op.num_ops; i++)
7817 19720 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7818 : {
7819 0 : if (dump_enabled_p ())
7820 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7821 : "incompatible vector types for invariants\n");
7822 0 : return false;
7823 : }
7824 :
7825 71981 : vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
7826 : reduction_type, ncopies, cost_vec);
7827 : /* Cost the reduction op inside the loop if transformed via
7828 : vect_transform_reduction for non-lane-reducing operation. Otherwise
7829 : this is costed by the separate vectorizable_* routines. */
7830 71981 : if (single_defuse_cycle)
7831 4415 : record_stmt_cost (cost_vec, ncopies, vector_stmt,
7832 : slp_for_stmt_info, 0, vect_body);
7833 :
7834 71981 : if (dump_enabled_p ()
7835 71981 : && reduction_type == FOLD_LEFT_REDUCTION)
7836 252 : dump_printf_loc (MSG_NOTE, vect_location,
7837 : "using an in-order (fold-left) reduction.\n");
7838 71981 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7839 :
7840 : /* All but single defuse-cycle optimized and fold-left reductions go
7841 : through their own vectorizable_* routines. */
7842 71981 : stmt_vec_info tem
7843 71981 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (slp_node_instance));
7844 71981 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
7845 63325 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7846 : else
7847 : {
7848 8656 : STMT_VINFO_DEF_TYPE (tem) = vect_reduction_def;
7849 8656 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7850 4020 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7851 : slp_node, op.code, op.type,
7852 : vectype_in);
7853 : }
7854 : return true;
7855 : }
7856 :
7857 : /* STMT_INFO is a dot-product reduction whose multiplication operands
7858 : have different signs. Emit a sequence to emulate the operation
7859 : using a series of signed DOT_PROD_EXPRs and return the last
7860 : statement generated. VEC_DEST is the result of the vector operation
7861 : and VOP lists its inputs. */
7862 :
7863 : static gassign *
7864 4 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7865 : gimple_stmt_iterator *gsi, tree vec_dest,
7866 : tree vop[3])
7867 : {
7868 4 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7869 4 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7870 4 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
7871 4 : gimple *new_stmt;
7872 :
7873 : /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
7874 4 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7875 0 : std::swap (vop[0], vop[1]);
7876 :
7877 : /* Convert all inputs to signed types. */
7878 12 : for (int i = 1; i < 3; ++i)
7879 8 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7880 : {
7881 0 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7882 0 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7883 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7884 0 : vop[i] = tmp;
7885 : }
7886 :
7887 : /* In the comments below we assume 8-bit inputs for simplicity,
7888 : but the approach works for any full integer type. */
7889 :
7890 : /* Create a vector of -128. */
7891 4 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7892 4 : tree min_narrow = build_vector_from_val (TREE_TYPE (vop[0]),
7893 4 : fold_convert
7894 : (TREE_TYPE (TREE_TYPE (vop[0])),
7895 : min_narrow_elttype));
7896 :
7897 : /* Create a vector of 64. */
7898 4 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7899 4 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7900 4 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7901 :
7902 : /* Emit: SUB_RES = VOP[0] - 128 in an unsigned type. */
7903 4 : tree sub_res = make_ssa_name (TREE_TYPE (vop[0]));
7904 4 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7905 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7906 :
7907 4 : vop[0] = make_ssa_name (narrow_vectype);
7908 4 : new_stmt = gimple_build_assign (vop[0], VIEW_CONVERT_EXPR,
7909 : build1 (VIEW_CONVERT_EXPR, narrow_vectype,
7910 : sub_res));
7911 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7912 :
7913 : /* Emit:
7914 :
7915 : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7916 : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7917 : STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7918 :
7919 : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7920 : Doing the two 64 * y steps first allows more time to compute x. */
7921 4 : tree stage1 = make_ssa_name (wide_vectype);
7922 4 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7923 : vop[1], half_narrow, vop[2]);
7924 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7925 :
7926 4 : tree stage2 = make_ssa_name (wide_vectype);
7927 4 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7928 : vop[1], half_narrow, stage1);
7929 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7930 :
7931 4 : tree stage3 = make_ssa_name (wide_vectype);
7932 4 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7933 : vop[0], vop[1], stage2);
7934 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7935 :
7936 : /* Convert STAGE3 to the reduction type. */
7937 4 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7938 4 : }
7939 :
7940 : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7941 : value. Vector statements are emitted at GSI and the results are recorded as vector defs of SLP_NODE. Fold-left reductions are delegated to vectorize_fold_left_reduction. */
7942 :
7943 : bool
7944 2589 : vect_transform_reduction (loop_vec_info loop_vinfo,
7945 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7946 : slp_tree slp_node)
7947 : {
7948 2589 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
7949 2589 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7950 2589 : unsigned vec_num;
7951 :
7952 2589 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
7953 :
7954 2589 : if (nested_in_vect_loop_p (loop, stmt_info))
7955 : {
7956 0 : loop = loop->inner;
7957 0 : gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
7958 : == vect_double_reduction_def);
7959 : }
7960 :
7961 2589 : gimple_match_op op;
7962 2589 : if (!gimple_extract_op (stmt_info->stmt, &op))
7963 0 : gcc_unreachable ();
7964 :
7965 : /* All uses but the last are expected to be defined in the loop.
7966 : The last use is the reduction variable. In case of nested cycle this
7967 : assumption is not true: we use reduc_index to record the index of the
7968 : reduction variable. */
7969 2589 : int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
7970 2589 : tree vectype_in = SLP_TREE_VECTYPE (slp_node);
7971 2589 : if (lane_reducing_op_p (op.code))
7972 260 : vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);
7973 :
7974 2589 : vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
7975 :
7976 2589 : code_helper code = canonicalize_code (op.code, op.type);
7977 2589 : internal_fn cond_fn
7978 484 : = ((code.is_internal_fn ()
7979 484 : && internal_fn_mask_index ((internal_fn)code) != -1)
7980 2589 : ? (internal_fn)code : get_conditional_internal_fn (code, op.type));
7981 :
7982 2589 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7983 2589 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7984 2589 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7985 :
7986 : /* Transform. */
7987 2589 : tree new_temp = NULL_TREE;
7988 18123 : auto_vec<tree> vec_oprnds[3];
7989 :
7990 2589 : if (dump_enabled_p ())
7991 756 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7992 :
7993 : /* A binary COND_OP reduction must have the same definition and else
7994 : value. */
7995 3073 : bool cond_fn_p = code.is_internal_fn ()
7996 484 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
7997 484 : if (cond_fn_p)
7998 : {
7999 484 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8000 : || code == IFN_COND_MUL || code == IFN_COND_AND
8001 : || code == IFN_COND_IOR || code == IFN_COND_XOR
8002 : || code == IFN_COND_MIN || code == IFN_COND_MAX);
8003 484 : gcc_assert (op.num_ops == 4
8004 : && (op.ops[reduc_index]
8005 : == op.ops[internal_fn_else_index ((internal_fn) code)]));
8006 : }
8007 :
8008 2589 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8009 :
8010 2589 : vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
8011 2589 : if (reduction_type == FOLD_LEFT_REDUCTION)
8012 : {
8013 847 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
8014 847 : gcc_assert (code.is_tree_code () || cond_fn_p);
8015 847 : return vectorize_fold_left_reduction
8016 847 : (loop_vinfo, stmt_info, gsi, slp_node,
8017 847 : code, reduc_fn, op.num_ops, vectype_in,
8018 847 : reduc_index, masks, lens);
8019 : }
8020 : /* Past this point the reduction is either a single def-use cycle or a lane-reducing op (asserted below). */
8021 1742 : bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
8022 1742 : bool lane_reducing = lane_reducing_op_p (code);
8023 1482 : gcc_assert (single_defuse_cycle || lane_reducing);
8024 :
8025 1742 : if (lane_reducing)
8026 : {
8027 : /* The last operand of lane-reducing op is for reduction. */
8028 260 : gcc_assert (reduc_index == (int) op.num_ops - 1);
8029 : }
8030 :
8031 : /* Create the destination vector */
8032 1742 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8033 1742 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8034 :
8035 : /* Get NCOPIES vector definitions for all operands except the reduction
8036 : definition. The reduction operand is only skipped here for single def-use cycles, where it is fetched separately below. */
8037 1742 : if (!cond_fn_p)
8038 : {
8039 1289 : gcc_assert (reduc_index >= 0 && reduc_index <= 2);
8040 2131 : vect_get_vec_defs (loop_vinfo, slp_node,
8041 1289 : single_defuse_cycle && reduc_index == 0
8042 : ? NULL_TREE : op.ops[0], &vec_oprnds[0],
8043 1289 : single_defuse_cycle && reduc_index == 1
8044 : ? NULL_TREE : op.ops[1], &vec_oprnds[1],
8045 1289 : op.num_ops == 3
8046 260 : && !(single_defuse_cycle && reduc_index == 2)
8047 : ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
8048 : }
8049 : else
8050 : {
8051 : /* For a conditional operation pass the truth type as mask
8052 : vectype. */
8053 453 : gcc_assert (single_defuse_cycle
8054 : && (reduc_index == 1 || reduc_index == 2));
8055 453 : vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
8056 : &vec_oprnds[0],
8057 : reduc_index == 1 ? NULL_TREE : op.ops[1],
8058 : &vec_oprnds[1],
8059 : reduc_index == 2 ? NULL_TREE : op.ops[2],
8060 : &vec_oprnds[2]);
8061 : }
8062 :
8063 : /* For single def-use cycles get one copy of the vectorized reduction
8064 : definition. */
8065 1742 : if (single_defuse_cycle)
8066 : {
8067 1653 : vect_get_vec_defs (loop_vinfo, slp_node,
8068 : reduc_index == 0 ? op.ops[0] : NULL_TREE,
8069 : &vec_oprnds[0],
8070 : reduc_index == 1 ? op.ops[1] : NULL_TREE,
8071 : &vec_oprnds[1],
8072 : reduc_index == 2 ? op.ops[2] : NULL_TREE,
8073 : &vec_oprnds[2]);
8074 : }
8075 89 : else if (lane_reducing)
8076 : {
8077 : /* For a normal reduction, consistency between vectorized def/use is
8078 : naturally ensured when mapping from the scalar statement. But if a lane-
8079 : reducing op is involved in the reduction, things become somewhat
8080 : complicated in that the op's result and operand for accumulation are
8081 : limited to fewer lanes than other operands, which certainly causes
8082 : def/use mismatch on adjacent statements around the op unless there is
8083 : some kind of specific adjustment. One approach is to refit the lane-
8084 : reducing op by introducing new trivial pass-through copies
8085 : to fix a possible def/use gap, so as to make it behave like a normal op.
8086 : And vector reduction PHIs are always generated to the full extent, no
8087 : matter whether a lane-reducing op exists or not. If some copies or PHIs are
8088 : actually superfluous, they would be cleaned up by passes after
8089 : vectorization. An example for single-lane slp, with lane-reducing ops
8090 : of mixed input vectypes in a reduction chain, is given below.
8091 : Similarly, this handling is applicable for multiple-lane slp as well.
8092 :
8093 : int sum = 1;
8094 : for (i)
8095 : {
8096 : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
8097 : sum += w[i]; // widen-sum <vector(16) char>
8098 : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
8099 : sum += n[i]; // normal <vector(4) int>
8100 : }
8101 :
8102 : The vector size is 128-bit, vectorization factor is 16. Reduction
8103 : statements would be transformed as:
8104 :
8105 : vector<4> int sum_v0 = { 0, 0, 0, 1 };
8106 : vector<4> int sum_v1 = { 0, 0, 0, 0 };
8107 : vector<4> int sum_v2 = { 0, 0, 0, 0 };
8108 : vector<4> int sum_v3 = { 0, 0, 0, 0 };
8109 :
8110 : for (i / 16)
8111 : {
8112 : sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8113 : sum_v1 = sum_v1; // copy
8114 : sum_v2 = sum_v2; // copy
8115 : sum_v3 = sum_v3; // copy
8116 :
8117 : sum_v0 = sum_v0; // copy
8118 : sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8119 : sum_v2 = sum_v2; // copy
8120 : sum_v3 = sum_v3; // copy
8121 :
8122 : sum_v0 = sum_v0; // copy
8123 : sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8124 : sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8125 : sum_v3 = sum_v3; // copy
8126 :
8127 : sum_v0 += n_v0[i: 0 ~ 3 ];
8128 : sum_v1 += n_v1[i: 4 ~ 7 ];
8129 : sum_v2 += n_v2[i: 8 ~ 11];
8130 : sum_v3 += n_v3[i: 12 ~ 15];
8131 : }
8132 :
8133 : Moreover, for higher instruction parallelism in the final vectorized
8134 : loop, the effective vector lane-reducing ops are meant to be
8135 : distributed evenly among all def-use cycles. In the above
8136 : example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
8137 : cycles, so instruction dependencies among them could be eliminated. */
8138 89 : unsigned effec_ncopies = vec_oprnds[0].length ();
8139 89 : unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8140 :
8141 89 : gcc_assert (effec_ncopies <= total_ncopies);
8142 : /* Pad the non-reduction operand vectors with cleared (null) entries up to the number of reduction copies. */
8143 89 : if (effec_ncopies < total_ncopies)
8144 : {
8145 267 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8146 : {
8147 356 : gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8148 178 : vec_oprnds[i].safe_grow_cleared (total_ncopies);
8149 : }
8150 : }
8151 :
8152 89 : tree reduc_vectype_in = vectype_in;
8153 89 : gcc_assert (reduc_vectype_in);
8154 :
8155 89 : unsigned effec_reduc_ncopies
8156 89 : = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
8157 :
8158 89 : gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8159 :
8160 89 : if (effec_ncopies < effec_reduc_ncopies)
8161 : {
8162 : /* Find suitable def-use cycles to generate vectorized statements
8163 : into, and reorder operands based on the selection. */
8164 0 : unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
8165 0 : unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8166 :
8167 0 : gcc_assert (curr_pos < effec_reduc_ncopies);
8168 0 : VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;
8169 :
8170 0 : if (curr_pos)
8171 : {
8172 0 : unsigned count = effec_reduc_ncopies - effec_ncopies;
8173 0 : unsigned start = curr_pos - count;
8174 : /* START is unsigned, so detect wrap-around of the subtraction above through the signed cast. */
8175 0 : if ((int) start < 0)
8176 : {
8177 0 : count = curr_pos;
8178 0 : start = 0;
8179 : }
8180 :
8181 0 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8182 : {
8183 0 : for (unsigned j = effec_ncopies; j > start; j--)
8184 : {
8185 0 : unsigned k = j - 1;
8186 0 : std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8187 0 : gcc_assert (!vec_oprnds[i][k]);
8188 : }
8189 : }
8190 : }
8191 : }
8192 : }
8193 :
8194 1742 : bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
8195 2991 : unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8196 1742 : unsigned mask_index = 0;
8197 :
8198 7661 : for (unsigned i = 0; i < num; ++i)
8199 : {
8200 5919 : gimple *new_stmt;
8201 5919 : tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8202 5919 : if (!vop[0] || !vop[1])
8203 : {
8204 473 : tree reduc_vop = vec_oprnds[reduc_index][i];
8205 :
8206 : /* If we could not generate an effective vector statement for the current
8207 : portion of the reduction operand, simply hand the operand over to other
8208 : dependent statements: reuse its defining statement if it is a non-default-def SSA name, otherwise insert a trivial copy. */
8209 473 : gcc_assert (reduc_vop);
8210 :
8211 473 : if (TREE_CODE (reduc_vop) == SSA_NAME
8212 473 : && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8213 473 : new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8214 : else
8215 : {
8216 0 : new_temp = make_ssa_name (vec_dest);
8217 0 : new_stmt = gimple_build_assign (new_temp, reduc_vop);
8218 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8219 : gsi);
8220 : }
8221 : }
8222 5446 : else if (masked_loop_p && !mask_by_cond_expr)
8223 : {
8224 : /* No conditional ifns have been defined for lane-reducing op
8225 : yet. */
8226 16 : gcc_assert (!lane_reducing);
8227 :
8228 16 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8229 : vec_num, vectype_in,
8230 : mask_index++);
8231 16 : gcall *call;
8232 24 : if (code.is_internal_fn () && cond_fn_p)
8233 : {
8234 16 : gcc_assert (op.num_ops >= 3
8235 : && internal_fn_mask_index (internal_fn (code)) == 0);
8236 8 : vop[2] = vec_oprnds[2][i];
8237 8 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
8238 : mask, vop[0], gsi);
8239 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
8240 : vop[2], vop[reduc_index]);
8241 : }
8242 : else
8243 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
8244 : vop[1], vop[reduc_index]);
8245 16 : new_temp = make_ssa_name (vec_dest, call);
8246 16 : gimple_call_set_lhs (call, new_temp);
8247 16 : gimple_call_set_nothrow (call, true);
8248 16 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8249 16 : new_stmt = call;
8250 : }
8251 : else
8252 : {
8253 5430 : if (op.num_ops >= 3)
8254 1770 : vop[2] = vec_oprnds[2][i];
8255 :
8256 5430 : if (masked_loop_p && mask_by_cond_expr)
8257 : {
8258 4 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8259 : vec_num, vectype_in,
8260 : mask_index++);
8261 4 : build_vect_cond_expr (code, vop, mask, gsi);
8262 : }
8263 :
8264 5430 : if (emulated_mixed_dot_prod)
8265 4 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8266 : vec_dest, vop);
8267 :
8268 6768 : else if (code.is_internal_fn () && !cond_fn_p)
8269 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8270 : op.num_ops,
8271 : vop[0], vop[1], vop[2]);
8272 6768 : else if (code.is_internal_fn () && cond_fn_p)
8273 1342 : new_stmt = gimple_build_call_internal (internal_fn (code),
8274 : op.num_ops,
8275 : vop[0], vop[1], vop[2],
8276 : vop[reduc_index]);
8277 : else
8278 4084 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8279 : vop[0], vop[1], vop[2]);
8280 5430 : new_temp = make_ssa_name (vec_dest, new_stmt);
8281 5430 : gimple_set_lhs (new_stmt, new_temp);
8282 5430 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8283 : }
8284 : /* In a single def-use cycle every result but the last is queued as the reduction operand of a later copy; otherwise the result becomes a vector def of SLP_NODE. */
8285 5919 : if (single_defuse_cycle && i < num - 1)
8286 3546 : vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8287 : else
8288 2373 : slp_node->push_vec_def (new_stmt);
8289 : }
8290 :
8291 : return true;
8292 10356 : }
8293 :
8294 : /* Transform phase of a cycle PHI: create the vector PHI nodes in the loop header and set their loop-entry (preheader) arguments. For EXTRACT_LAST and FOLD_LEFT reductions the scalar PHI is left in place instead. */
8295 :
8296 : bool
8297 23484 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8298 : stmt_vec_info stmt_info,
8299 : slp_tree slp_node, slp_instance slp_node_instance)
8300 : {
8301 23484 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
8302 23484 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8303 23484 : int i;
8304 23484 : bool nested_cycle = false;
8305 23484 : int vec_num;
8306 :
8307 23622 : if (nested_in_vect_loop_p (loop, stmt_info))
8308 : {
8309 : loop = loop->inner;
8310 : nested_cycle = true;
8311 : }
8312 :
8313 23484 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
8314 23484 : if (reduc_info
8315 22822 : && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8316 22822 : || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
8317 : /* Leave the scalar phi in place. */
8318 : return true;
8319 :
8320 21975 : if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ())
8321 118 : dump_printf_loc (MSG_NOTE, vect_location,
8322 : "vectorizing a reduction chain\n");
8323 :
8324 22637 : vec_num = vect_get_num_copies (loop_vinfo, slp_node);
8325 :
8326 : /* Check whether we should use a single PHI node and accumulate
8327 : vectors to one before the backedge. */
8328 22637 : if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
8329 22637 : vec_num = 1;
8330 :
8331 : /* Create the destination vector */
8332 22637 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8333 22637 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8334 : vectype_out);
8335 :
8336 : /* Get the loop-entry arguments. */
8337 22637 : auto_vec<tree> vec_initial_defs;
8338 22637 : vec_initial_defs.reserve (vec_num);
8339 : /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8340 : and we can't use zero for induc_val, use initial_def. Similarly
8341 : for REDUC_MIN and initial_def larger than the base. */
8342 22637 : if (reduc_info
8343 21975 : && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8344 : {
8345 62 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8346 62 : tree initial_def = vect_phi_initial_value (phi);
8347 62 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
8348 62 : tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
8349 62 : if (TREE_CODE (initial_def) == INTEGER_CST
8350 60 : && !integer_zerop (induc_val)
8351 122 : && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
8352 42 : && tree_int_cst_lt (initial_def, induc_val))
8353 58 : || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
8354 18 : && tree_int_cst_lt (induc_val, initial_def))))
8355 : {
8356 2 : induc_val = initial_def;
8357 : /* Communicate we used the initial_def to epilogue
8358 : generation. */
8359 2 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8360 : }
8361 62 : vec_initial_defs.quick_push
8362 62 : (build_vector_from_val (vectype_out, induc_val));
8363 62 : }
8364 22575 : else if (nested_cycle)
8365 : {
8366 748 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8367 748 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8368 : &vec_initial_defs);
8369 : }
8370 : else
8371 : {
8372 21827 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8373 21827 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
8374 21827 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8375 : /* Record the scalar initial value of each PHI; a reduction chain records only the first one. */
8376 21827 : unsigned int num_phis = stmts.length ();
8377 21827 : if (reduc_info->is_reduc_chain)
8378 185 : num_phis = 1;
8379 21827 : initial_values.reserve (num_phis);
8380 44099 : for (unsigned int i = 0; i < num_phis; ++i)
8381 : {
8382 22272 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8383 22272 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8384 : }
8385 21827 : tree neutral_op = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
8386 21827 : if (vec_num == 1
8387 21827 : && vect_find_reusable_accumulator (loop_vinfo,
8388 : reduc_info, vectype_out))
8389 : ;
8390 : /* Try to simplify the vector initialization by applying an
8391 : adjustment after the reduction has been performed. This
8392 : can also break a critical path but on the other hand
8393 : requires to keep the initial value live across the loop. */
8394 17812 : else if (neutral_op
8395 17255 : && initial_values.length () == 1
8396 17071 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8397 34806 : && !operand_equal_p (neutral_op, initial_values[0]))
8398 : {
8399 12152 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
8400 12152 : = initial_values[0];
8401 12152 : initial_values[0] = neutral_op;
8402 : }
8403 21827 : if (!VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
8404 4015 : || loop_vinfo->main_loop_edge)
8405 43208 : get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
8406 : &vec_initial_defs, vec_num,
8407 : stmts.length (), neutral_op);
8408 : }
8409 : /* When an accumulator is reused, adapt its value to VECTYPE_OUT and use it as (or merge it into) the initial def. */
8410 22637 : if (reduc_info)
8411 21975 : if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
8412 : {
8413 4015 : tree def = accumulator->reduc_input;
8414 4015 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8415 : {
8416 4012 : unsigned int nreduc;
8417 8024 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8418 4012 : (TREE_TYPE (def)),
8419 4012 : TYPE_VECTOR_SUBPARTS (vectype_out),
8420 : &nreduc);
8421 0 : gcc_assert (res);
8422 4012 : gimple_seq stmts = NULL;
8423 : /* Reduce the single vector to a smaller one. */
8424 4012 : if (nreduc != 1)
8425 : {
8426 : /* Perform the reduction in the appropriate type. */
8427 4012 : tree rvectype = vectype_out;
8428 4012 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8429 4012 : TREE_TYPE (TREE_TYPE (def))))
8430 235 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8431 : TYPE_VECTOR_SUBPARTS
8432 470 : (vectype_out));
8433 4012 : def = vect_create_partial_epilog (def, rvectype,
8434 : VECT_REDUC_INFO_CODE
8435 : (reduc_info),
8436 : &stmts);
8437 : }
8438 : /* The epilogue loop might use a different vector mode, like
8439 : VNx2DI vs. V2DI. */
8440 4012 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8441 : {
8442 0 : tree reduc_type = build_vector_type_for_mode
8443 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8444 0 : def = gimple_convert (&stmts, reduc_type, def);
8445 : }
8446 : /* Adjust the input so we pick up the partially reduced value
8447 : for the skip edge in vect_create_epilog_for_reduction. */
8448 4012 : accumulator->reduc_input = def;
8449 : /* And the reduction could be carried out using a different sign. */
8450 4012 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8451 235 : def = gimple_convert (&stmts, vectype_out, def);
8452 4012 : edge e;
8453 4012 : if ((e = loop_vinfo->main_loop_edge)
8454 4012 : || (e = loop_vinfo->skip_this_loop_edge))
8455 : {
8456 : /* While we'd like to insert on the edge this will split
8457 : blocks and disturb bookkeeping, we also will eventually
8458 : need this on the skip edge. Rely on sinking to
8459 : fixup optimal placement and insert in the pred. */
8460 3789 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
8461 : /* Insert before a cond that eventually skips the
8462 : epilogue. */
8463 3789 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8464 3772 : gsi_prev (&gsi);
8465 3789 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8466 : }
8467 : else
8468 223 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8469 : stmts);
8470 : }
8471 4015 : if (loop_vinfo->main_loop_edge)
8472 3792 : vec_initial_defs[0]
8473 3792 : = vect_get_main_loop_result (loop_vinfo, def,
8474 3792 : vec_initial_defs[0]);
8475 : else
8476 223 : vec_initial_defs.safe_push (def);
8477 : }
8478 :
8479 : /* Generate the reduction PHIs upfront. */
8480 47061 : for (i = 0; i < vec_num; i++)
8481 : {
8482 24424 : tree vec_init_def = vec_initial_defs[i];
8483 : /* Create the reduction-phi that defines the reduction
8484 : operand. */
8485 24424 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
8486 24424 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8487 : UNKNOWN_LOCATION);
8488 :
8489 : /* The loop-latch arg is set in epilogue processing. */
8490 :
8491 24424 : slp_node->push_vec_def (new_phi);
8492 : }
8493 :
8494 22637 : return true;
8495 22637 : }
8496 :
8497 : /* Vectorizes LC PHIs. This is the analysis phase: return true and mark SLP_NODE as lc_phi_info_type if STMT_INFO is a single-argument PHI whose vector type is compatible with that of its operand. */
8498 :
8499 : bool
8500 181999 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8501 : stmt_vec_info stmt_info,
8502 : slp_tree slp_node)
8503 : {
8504 181999 : if (!loop_vinfo
8505 181999 : || !is_a <gphi *> (stmt_info->stmt)
8506 217976 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8507 : return false;
8508 : /* Only internal defs and double-reduction defs are handled. */
8509 821 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8510 0 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8511 : return false;
8512 :
8513 : /* Deal with copies from externs or constants that disguise as
8514 : loop-closed PHI nodes (PR97886). */
8515 821 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8516 : SLP_TREE_VECTYPE (slp_node)))
8517 : {
8518 0 : if (dump_enabled_p ())
8519 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8520 : "incompatible vector types for invariants\n");
8521 0 : return false;
8522 : }
8523 :
8524 : /* ??? This can happen with data vs. mask uses of boolean. */
8525 821 : if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
8526 821 : SLP_TREE_VECTYPE
8527 : (SLP_TREE_CHILDREN (slp_node)[0])))
8528 : {
8529 0 : if (dump_enabled_p ())
8530 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8531 : "missed mask promotion\n");
8532 0 : return false;
8533 : }
8534 :
8535 821 : SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
8536 821 : return true;
8537 : }
8538 :
8539 : bool
8540 530 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
8541 : stmt_vec_info stmt_info,
8542 : slp_tree slp_node)
8543 : /* Transform phase for LC PHIs: for each vector def of the single PHI argument, create a vector PHI in the block of STMT_INFO fed over that block's single predecessor edge, and record it as a vector def of SLP_NODE. Always returns true. */
8544 : {
8545 530 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8546 530 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8547 530 : basic_block bb = gimple_bb (stmt_info->stmt);
8548 530 : edge e = single_pred_edge (bb);
8549 530 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8550 530 : auto_vec<tree> vec_oprnds;
8551 1060 : vect_get_vec_defs (loop_vinfo, slp_node,
8552 530 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8553 1175 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8554 : {
8555 : /* Create the vectorized LC PHI node. */
8556 645 : gphi *new_phi = create_phi_node (vec_dest, bb);
8557 645 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8558 645 : slp_node->push_vec_def (new_phi);
8559 : }
8560 :
8561 530 : return true;
8562 530 : }
8563 :
8564 : /* Vectorizes PHIs. When COST_VEC is non-null only analysis and costing are done and SLP_NODE is marked as phi_info_type; otherwise the vector PHIs are created in the block of STMT_INFO. Returns false if the PHI cannot be vectorized. */
8565 :
8566 : bool
8567 140477 : vectorizable_phi (bb_vec_info vinfo,
8568 : stmt_vec_info stmt_info,
8569 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8570 : {
8571 140477 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8572 : return false;
8573 :
8574 72486 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8575 : return false;
8576 :
8577 72486 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8578 :
8579 72486 : if (cost_vec) /* transformation not required. */
8580 : {
8581 : slp_tree child;
8582 : unsigned i;
8583 198922 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8584 140652 : if (!child)
8585 : {
8586 0 : if (dump_enabled_p ())
8587 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8588 : "PHI node with unvectorized backedge def\n");
8589 0 : return false;
8590 : }
8591 140652 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8592 : {
8593 18 : if (dump_enabled_p ())
8594 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8595 : "incompatible vector types for invariants\n");
8596 18 : return false;
8597 : }
8598 140634 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8599 140634 : && !useless_type_conversion_p (vectype,
8600 : SLP_TREE_VECTYPE (child)))
8601 : {
8602 : /* With bools we can have mask and non-mask precision vectors
8603 : or different non-mask precisions. While pattern recog is
8604 : supposed to guarantee consistency here, bugs in it can cause
8605 : mismatches (PR103489 and PR103800 for example).
8606 : Deal with them here instead of ICEing later. */
8607 18 : if (dump_enabled_p ())
8608 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8609 : "incompatible vector type setup from "
8610 : "bool pattern detection\n");
8611 18 : return false;
8612 : }
8613 :
8614 : /* For single-argument PHIs assume coalescing which means zero cost
8615 : for the scalar and the vector PHIs. This avoids artificially
8616 : favoring the vector path (but may pessimize it in some cases). */
8617 58270 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8618 52897 : record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
8619 : vector_stmt, slp_node, vectype, 0, vect_body);
8620 58270 : SLP_TREE_TYPE (slp_node) = phi_info_type;
8621 58270 : return true;
8622 : }
8623 : /* Transform: build the vector PHIs and add the arguments of the already vectorized children. */
8624 14180 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8625 14180 : basic_block bb = gimple_bb (stmt_info->stmt);
8626 14180 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8627 14180 : auto_vec<gphi *> new_phis;
8628 51725 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8629 : {
8630 37545 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8631 :
8632 : /* Skip not yet vectorized defs. */
8633 37994 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8634 37545 : && SLP_TREE_VEC_DEFS (child).is_empty ())
8635 449 : continue;
8636 :
8637 37096 : auto_vec<tree> vec_oprnds;
8638 37096 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8639 37096 : if (!new_phis.exists ())
8640 : {
8641 14180 : new_phis.create (vec_oprnds.length ());
8642 29992 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8643 : {
8644 : /* Create the vectorized PHI node. */
8645 15812 : new_phis.quick_push (create_phi_node (vec_dest, bb));
8646 15812 : slp_node->push_vec_def (new_phis[j]);
8647 : }
8648 : }
8649 37096 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8650 80896 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8651 43800 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8652 37096 : }
8653 : /* We should have at least one already vectorized child. */
8654 14180 : gcc_assert (new_phis.exists ());
8655 :
8656 14180 : return true;
8657 14180 : }
8658 :
8659 : /* Vectorizes first order recurrences. An overview of the transformation
8660 : is described below. Suppose we have the following loop.
8661 :
8662 : int t = 0;
8663 : for (int i = 0; i < n; ++i)
8664 : {
8665 : b[i] = a[i] - t;
8666 : t = a[i];
8667 : }
8668 :
8669 : There is a first-order recurrence on 'a'. For this loop, the scalar IR
8670 : looks (simplified) like:
8671 :
8672 : scalar.preheader:
8673 : init = 0;
8674 :
8675 : scalar.body:
8676 : i = PHI <0(scalar.preheader), i+1(scalar.body)>
8677 : _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8678 : _1 = a[i]
8679 : b[i] = _1 - _2
8680 : if (i < n) goto scalar.body
8681 :
8682 : In this example, _2 is a recurrence because its value depends on the
8683 : previous iteration. We vectorize this as (VF = 4)
8684 :
8685 : vector.preheader:
8686 : vect_init = vect_cst(..., ..., ..., 0)
8687 :
8688 : vector.body
8689 : i = PHI <0(vector.preheader), i+4(vector.body)>
8690 : vect_1 = PHI <vect_init(vector.preheader), vect_2(vector.body)>
8691 : vect_2 = a[i, i+1, i+2, i+3];
8692 : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8693 : b[i, i+1, i+2, i+3] = vect_2 - vect_3
8694 : if (..) goto vector.body
8695 :
8696 : In this function, vectorizable_recurr, we code generate both the
8697 : vector PHI node and the permute since those together compute the
8698 : vectorized value of the scalar PHI. We do not yet have the
8699 : backedge value to fill in there nor into the vec_perm. Those
8700 : are filled in vect_schedule_scc.
8701 :
8702 : TODO: Since the scalar loop does not have a use of the recurrence
8703 : outside of the loop the natural way to implement peeling via
8704 : vectorizing the live value doesn't work. For now peeling of loops
8705 : with a recurrence is not implemented. For SLP the supported cases
8706 : are restricted to those requiring a single vector recurrence PHI. */
8707 :
8708 : bool
8709 181221 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8710 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8711 : {
8712 181221 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8713 : return false;
8714 :
8715 35199 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
8716 :
8717 : /* So far we only support first-order recurrence auto-vectorization. */
8718 35199 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8719 : return false;
8720 :
8721 416 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8722 416 : unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
8723 416 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8724 416 : unsigned dist = SLP_TREE_LANES (slp_node);
8725 : /* We need to be able to make progress with a single vector. */
8726 416 : if (maybe_gt (dist * 2, nunits))
8727 : {
8728 0 : if (dump_enabled_p ())
8729 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8730 : "first order recurrence exceeds half of "
8731 : "a vector\n");
8732 0 : return false;
8733 : }
8734 :
8735 : /* We need to be able to build a { ..., a, b } init vector with
8736 : dist number of distinct trailing values. Always possible
8737 : when dist == 1 or when nunits is constant or when the initializations
8738 : are uniform. */
8739 416 : tree uniform_initval = NULL_TREE;
8740 416 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8741 1688 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8742 : {
8743 452 : gphi *phi = as_a <gphi *> (s->stmt);
8744 452 : if (! uniform_initval)
8745 416 : uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8746 36 : else if (! operand_equal_p (uniform_initval,
8747 36 : PHI_ARG_DEF_FROM_EDGE (phi, pe)))
8748 : {
8749 : uniform_initval = NULL_TREE;
8750 : break;
8751 : }
8752 : }
8753 416 : if (!uniform_initval && !nunits.is_constant ())
8754 : {
8755 : if (dump_enabled_p ())
8756 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8757 : "cannot build initialization vector for "
8758 : "first order recurrence\n");
8759 : return false;
8760 : }
8761 :
8762 : /* First-order recurrence autovectorization needs to handle permutation
8763 : with indices = [nunits-1, nunits, nunits+1, ...]. */
8764 416 : vec_perm_builder sel (nunits, 1, 3);
8765 1664 : for (int i = 0; i < 3; ++i)
8766 1248 : sel.quick_push (nunits - dist + i);
8767 416 : vec_perm_indices indices (sel, 2, nunits);
8768 :
8769 416 : if (cost_vec) /* transformation not required. */
8770 : {
8771 373 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8772 : indices))
8773 : return false;
8774 :
8775 : /* We eventually need to set a vector type on invariant
8776 : arguments. */
8777 : unsigned j;
8778 : slp_tree child;
8779 783 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8780 522 : if (!vect_maybe_update_slp_op_vectype (child, vectype))
8781 : {
8782 0 : if (dump_enabled_p ())
8783 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8784 : "incompatible vector types for "
8785 : "invariants\n");
8786 0 : return false;
8787 : }
8788 :
8789 : /* Verify we have set up compatible types. */
8790 261 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8791 261 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
8792 261 : tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
8793 261 : if (!types_compatible_p (latch_vectype, vectype))
8794 : return false;
8795 :
8796 : /* The recurrence costs the initialization vector and one permute
8797 : for each copy. With SLP the prologue value is explicitly
8798 : represented and costed separately. */
8799 261 : unsigned prologue_cost = 0;
8800 261 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8801 : slp_node, 0, vect_body);
8802 261 : if (dump_enabled_p ())
8803 50 : dump_printf_loc (MSG_NOTE, vect_location,
8804 : "vectorizable_recurr: inside_cost = %d, "
8805 : "prologue_cost = %d .\n", inside_cost,
8806 : prologue_cost);
8807 :
8808 261 : SLP_TREE_TYPE (slp_node) = recurr_info_type;
8809 261 : return true;
8810 : }
8811 :
8812 43 : tree vec_init;
8813 43 : if (! uniform_initval)
8814 : {
8815 6 : vec<constructor_elt, va_gc> *v = NULL;
8816 6 : vec_alloc (v, nunits.to_constant ());
8817 33 : for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
8818 27 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8819 : build_zero_cst (TREE_TYPE (vectype)));
8820 39 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8821 : {
8822 21 : gphi *phi = as_a <gphi *> (s->stmt);
8823 21 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8824 21 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
8825 21 : TREE_TYPE (preheader)))
8826 : {
8827 0 : gimple_seq stmts = NULL;
8828 0 : preheader = gimple_convert (&stmts,
8829 0 : TREE_TYPE (vectype), preheader);
8830 0 : gsi_insert_seq_on_edge_immediate (pe, stmts);
8831 : }
8832 21 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
8833 : }
8834 6 : vec_init = build_constructor (vectype, v);
8835 : }
8836 : else
8837 : vec_init = uniform_initval;
8838 43 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8839 :
8840 : /* Create the vectorized first-order PHI node. */
8841 43 : tree vec_dest = vect_get_new_vect_var (vectype,
8842 : vect_simple_var, "vec_recur_");
8843 43 : basic_block bb = gimple_bb (phi);
8844 43 : gphi *new_phi = create_phi_node (vec_dest, bb);
8845 43 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8846 :
8847 : /* Insert shuffles the first-order recurrence autovectorization.
8848 : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8849 43 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
8850 :
8851 : /* Insert the required permute after the latch definition. The
8852 : second and later operands are tentative and will be updated when we have
8853 : vectorized the latch definition. */
8854 43 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8855 43 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8856 43 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8857 51 : do
8858 : {
8859 51 : gsi_next (&gsi2);
8860 : }
8861 : /* Skip inserted vectorized stmts for the latch definition. We have to
8862 : insert after those. */
8863 94 : while (gsi_stmt (gsi2) && gimple_uid (gsi_stmt (gsi2)) == 0);
8864 :
8865 123 : for (unsigned i = 0; i < ncopies; ++i)
8866 : {
8867 80 : vec_dest = make_ssa_name (vectype);
8868 80 : gassign *vperm
8869 123 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8870 43 : i == 0 ? gimple_phi_result (new_phi) : NULL,
8871 : NULL, perm);
8872 80 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8873 :
8874 80 : slp_node->push_vec_def (vperm);
8875 : }
8876 :
8877 : return true;
8878 416 : }
8879 :
8880 : /* Return true if VECTYPE represents a vector that requires lowering
8881 : by the vector lowering pass. That is the case when the mode of VECTYPE is not a vector mode, unless VECTYPE is a boolean vector whose element precision is 1. */
8882 :
8883 : bool
8884 807062 : vect_emulated_vector_p (tree vectype)
8885 : {
8886 1614124 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8887 811141 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8888 4061 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8889 : }
8890 :
8891 : /* Return true if we can emulate CODE on an integer mode representation
8892 : of a vector. Only PLUS_EXPR, MINUS_EXPR, NEGATE_EXPR and the bitwise AND, IOR, XOR and NOT codes are accepted. */
8893 :
8894 : bool
8895 11711 : vect_can_vectorize_without_simd_p (tree_code code)
8896 : {
8897 11711 : switch (code)
8898 : {
8899 : case PLUS_EXPR:
8900 : case MINUS_EXPR:
8901 : case NEGATE_EXPR:
8902 : case BIT_AND_EXPR:
8903 : case BIT_IOR_EXPR:
8904 : case BIT_XOR_EXPR:
8905 : case BIT_NOT_EXPR:
8906 : return true;
8907 :
8908 11146 : default:
8909 11146 : return false;
8910 : }
8911 : }
8912 :
8913 : /* Likewise, but taking a code_helper. A CODE that is not a tree code is always rejected. */
8914 :
8915 : bool
8916 992 : vect_can_vectorize_without_simd_p (code_helper code)
8917 : {
8918 992 : return (code.is_tree_code ()
8919 992 : && vect_can_vectorize_without_simd_p (tree_code (code)));
8920 : }
8921 :
8922 : /* Create the initial vector value of a nonlinear iv of type VECTYPE with NUNITS lanes from the scalars INIT_EXPR and STEP_EXPR according to INDUCTION_TYPE. Statements needed to compute it are appended to STMTS. Returns the vector value. */
8923 : static tree
8924 916 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8925 : tree step_expr, poly_uint64 nunits,
8926 : tree vectype,
8927 : enum vect_induction_op_type induction_type)
8928 : {
8929 916 : unsigned HOST_WIDE_INT const_nunits;
8930 916 : tree vec_shift, vec_init, new_name;
8931 916 : unsigned i;
8932 916 : tree itype = TREE_TYPE (vectype);
8933 :
8934 : /* Create vec_init from X = init_expr (converted to the element type) and
8935 : S = step_expr; how the lanes are derived depends on INDUCTION_TYPE. */
8936 916 : new_name = gimple_convert (stmts, itype, init_expr);
8937 916 : switch (induction_type)
8938 : {
8939 18 : case vect_step_op_shr:
8940 18 : case vect_step_op_shl:
8941 : /* Shift a vector with X in every lane by the VEC_SERIES_EXPR built from 0 and S. */
8942 18 : vec_init = gimple_build_vector_from_val (stmts,
8943 : vectype,
8944 : new_name);
8945 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8946 : build_zero_cst (itype), step_expr);
8947 18 : vec_init = gimple_build (stmts,
8948 : (induction_type == vect_step_op_shr
8949 : ? RSHIFT_EXPR : LSHIFT_EXPR),
8950 : vectype, vec_init, vec_shift);
8951 18 : break;
8952 :
8953 822 : case vect_step_op_neg:
8954 822 : {
8955 822 : vec_init = gimple_build_vector_from_val (stmts,
8956 : vectype,
8957 : new_name);
8958 822 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8959 : vectype, vec_init);
8960 : /* The encoding has 2 interleaved stepped patterns: { 0, nunits, 1, nunits + 1, 2, nunits + 2, ... }, i.e. alternately an element of vec_init and an element of vec_neg. */
8961 822 : vec_perm_builder sel (nunits, 2, 3);
8962 822 : sel.quick_grow (6);
8963 4110 : for (i = 0; i < 3; i++)
8964 : {
8965 2466 : sel[2 * i] = i;
8966 2466 : sel[2 * i + 1] = i + nunits;
8967 : }
8968 822 : vec_perm_indices indices (sel, 2, nunits);
8969 : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8970 : fail when vec_init is const vector. In that situation vec_perm is not
8971 : really needed. */
8972 822 : tree perm_mask_even
8973 822 : = vect_gen_perm_mask_any (vectype, indices);
8974 822 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8975 : vectype,
8976 : vec_init, vec_neg,
8977 : perm_mask_even);
8978 822 : }
8979 822 : break;
8980 :
8981 76 : case vect_step_op_mul:
8982 76 : {
8983 : /* Use unsigned mult to avoid undefined signed integer overflow. This case requires a constant NUNITS. */
8984 76 : gcc_assert (nunits.is_constant (&const_nunits));
8985 76 : tree utype = unsigned_type_for (itype);
8986 76 : tree uvectype = build_vector_type (utype,
8987 76 : TYPE_VECTOR_SUBPARTS (vectype));
8988 76 : new_name = gimple_convert (stmts, utype, new_name);
8989 76 : vec_init = gimple_build_vector_from_val (stmts,
8990 : uvectype,
8991 : new_name);
8992 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
8993 76 : tree elt_step = build_one_cst (utype);
8994 :
8995 76 : elts.quick_push (elt_step);
8996 660 : for (i = 1; i < const_nunits; i++)
8997 : {
8998 : /* Create: elt_step_i = elt_step_(i-1) * step_expr. */
8999 508 : elt_step = gimple_build (stmts, MULT_EXPR,
9000 : utype, elt_step, step_expr);
9001 508 : elts.quick_push (elt_step);
9002 : }
9003 : /* Create a vector from [elt_step_0, elt_step_1, ...,
9004 : elt_step_nunits-1], where elt_step_0 is 1, and multiply the splat of X by it. */
9005 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
9006 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9007 : vec_init, vec_mul);
9008 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
9009 76 : }
9010 76 : break;
9011 :
9012 0 : default:
9013 0 : gcc_unreachable ();
9014 : }
9015 :
9016 916 : return vec_init;
9017 : }
9018 :
9019 : /* Return INIT_EXPR advanced by SKIP_NITERS iterations of the nonlinear induction INDUCTION_TYPE with step STEP_EXPR, appending any needed statements to STMTS. SKIP_NITERS must be an INTEGER_CST unless EARLY_EXIT_P. */
9020 : tree
9021 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9022 : tree skip_niters, tree step_expr,
9023 : enum vect_induction_op_type induction_type,
9024 : bool early_exit_p)
9025 : {
9026 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST || early_exit_p);
9027 84 : tree type = TREE_TYPE (init_expr);
9028 84 : unsigned prec = TYPE_PRECISION (type);
9029 84 : switch (induction_type)
9030 : {
9031 : /* neg inductions are typically not used for loop termination conditions but
9032 : are typically implemented as b = -b. That is every scalar iteration b is
9033 : negated. That means that for the initial value of b we will have to
9034 : determine whether the number of skipped iteration is a multiple of 2
9035 : because every 2 scalar iterations we are back at "b". */
9036 0 : case vect_step_op_neg:
9037 : /* For early exits the neg induction will always be the same value at the
9038 : start of the iteration. */
9039 0 : if (early_exit_p)
9040 : break;
9041 :
9042 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
9043 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9044 : /* else no change. */
9045 : break;
9046 :
9047 12 : case vect_step_op_shr:
9048 12 : case vect_step_op_shl:
9049 12 : skip_niters = fold_build1 (NOP_EXPR, type, skip_niters);
9050 12 : step_expr = fold_build1 (NOP_EXPR, type, step_expr);
9051 12 : step_expr = fold_build2 (MULT_EXPR, type, step_expr, skip_niters);
9052 : /* When the total shift amount >= precision we need to avoid undefined behavior.
9053 : In the original loop there is none, and according to its semantics
9054 : init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. This clamping is only applied when !EARLY_EXIT_P. */
9055 12 : if ((!tree_fits_uhwi_p (step_expr)
9056 12 : || tree_to_uhwi (step_expr) >= prec)
9057 6 : && !early_exit_p)
9058 : {
9059 6 : if (induction_type == vect_step_op_shl
9060 6 : || TYPE_UNSIGNED (type))
9061 4 : init_expr = build_zero_cst (type);
9062 : else
9063 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9064 : init_expr,
9065 4 : wide_int_to_tree (type, prec - 1));
9066 : }
9067 : else
9068 : {
9069 8 : init_expr = fold_build2 ((induction_type == vect_step_op_shr
9070 : ? RSHIFT_EXPR : LSHIFT_EXPR),
9071 : type, init_expr, step_expr);
9072 6 : init_expr = force_gimple_operand (init_expr, stmts, false, NULL);
9073 : }
9074 : break;
9075 :
9076 72 : case vect_step_op_mul:
9077 72 : {
9078 : /* Due to UB we can't support vect_step_op_mul with early break for now,
9079 : so assert and block. The factor STEP_EXPR ** SKIP_NITERS is computed modulo 2 ** precision and the multiplication is done in the unsigned type. */
9080 72 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9081 72 : tree utype = unsigned_type_for (type);
9082 72 : init_expr = gimple_convert (stmts, utype, init_expr);
9083 72 : wide_int skipn = wi::to_wide (skip_niters);
9084 72 : wide_int begin = wi::to_wide (step_expr);
9085 72 : auto_mpz base, exp, mod, res;
9086 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
9087 72 : wi::to_mpz (skipn, exp, UNSIGNED);
9088 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9089 72 : mpz_powm (res, base, exp, mod);
9090 72 : begin = wi::from_mpz (utype, res, true);
9091 72 : tree mult_expr = wide_int_to_tree (utype, begin);
9092 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
9093 : init_expr, mult_expr);
9094 72 : init_expr = gimple_convert (stmts, type, init_expr);
9095 72 : }
9096 72 : break;
9097 :
9098 0 : default:
9099 0 : gcc_unreachable ();
9100 : }
9101 :
9102 84 : return init_expr;
9103 : }
9104 :
9105 : /* Create the scalar step by which a nonlinear iv with per-iteration step STEP_EXPR advances over VF iterations: STEP_EXPR ** VF as a constant for mul induction (VF must be constant), NULL for neg induction, and otherwise VF * STEP_EXPR built in STMTS. */
9106 : static tree
9107 1202 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9108 : poly_uint64 vf,
9109 : enum vect_induction_op_type induction_type)
9110 : {
9111 1202 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9112 1202 : tree new_name = NULL;
9113 : /* Step should be pow (step, vf) for mult induction. */
9114 1202 : if (induction_type == vect_step_op_mul)
9115 : {
9116 76 : gcc_assert (vf.is_constant ());
9117 76 : wide_int begin = wi::to_wide (step_expr);
9118 :
9119 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9120 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9121 :
9122 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9123 76 : }
9124 1126 : else if (induction_type == vect_step_op_neg)
9125 : /* Do nothing, NULL is returned. */
9126 : ;
9127 : else
9128 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9129 : expr, step_expr);
9130 1202 : return new_name;
9131 : }
9132 : /* Return a vector of type VECTYPE with every element set to the scalar step NEW_NAME, created through vect_init_vector for STMT_INFO, or NULL for neg induction. NEW_NAME must be a constant or an SSA name. */
9133 : static tree
9134 1202 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9135 : stmt_vec_info stmt_info,
9136 : tree new_name, tree vectype,
9137 : enum vect_induction_op_type induction_type)
9138 : {
9139 : /* No step is needed for neg induction. */
9140 1202 : if (induction_type == vect_step_op_neg)
9141 : return NULL;
9142 :
9143 94 : tree t = unshare_expr (new_name);
9144 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9145 : || TREE_CODE (new_name) == SSA_NAME);
9146 94 : tree new_vec = build_vector_from_val (vectype, t);
9147 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9148 : new_vec, vectype, NULL);
9149 94 : return vec_step;
9150 : }
9151 :
9152 : /* Return the value of the vectorized iv INDUC_DEF of type VECTYPE after applying VEC_STEP once for INDUCTION_TYPE: a multiply done in the unsigned vector type, a right or left shift, or INDUC_DEF itself for neg induction. New statements are appended to STMTS. */
9153 : static tree
9154 1390 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9155 : tree induc_def, tree vec_step,
9156 : enum vect_induction_op_type induction_type)
9157 : {
9158 1390 : tree vec_def = induc_def;
9159 1390 : switch (induction_type)
9160 : {
9161 76 : case vect_step_op_mul:
9162 76 : {
9163 : /* Use unsigned mult to avoid undefined signed integer overflow. */
9164 76 : tree uvectype = unsigned_type_for (vectype);
9165 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9166 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9167 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9168 : vec_def, vec_step);
9169 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9170 : }
9171 76 : break;
9172 :
9173 12 : case vect_step_op_shr:
9174 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9175 : vec_def, vec_step);
9176 12 : break;
9177 :
9178 6 : case vect_step_op_shl:
9179 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9180 : vec_def, vec_step);
9181 6 : break;
9182 : case vect_step_op_neg:
9183 : vec_def = induc_def;
9184 : /* Do nothing, the vector is returned unchanged. */
9185 : break;
9186 0 : default:
9187 0 : gcc_unreachable ();
9188 : }
9189 :
9190 1390 : return vec_def;
9191 :
9192 : }
9193 :
9194 : /* Function vectorizable_nonlinear_induction
9195 :
9196 : Check if STMT_INFO performs a nonlinear induction computation that can be
9197 : vectorized. If COST_VEC is passed, only record the costs there. Otherwise
9198 : vectorize the induction PHI: create a vectorized phi in the loop header and
9199 : record it, and any further copies, as vector defs of SLP_NODE.
9200 : Return true if STMT_INFO is vectorizable in this way. */
9201 :
9202 : static bool
9203 9198 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9204 : stmt_vec_info stmt_info,
9205 : slp_tree slp_node,
9206 : stmt_vector_for_cost *cost_vec)
9207 : {
9208 9198 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9209 9198 : unsigned ncopies;
9210 9198 : bool nested_in_vect_loop = false;
9211 9198 : class loop *iv_loop;
9212 9198 : tree vec_def;
9213 9198 : edge pe = loop_preheader_edge (loop);
9214 9198 : basic_block new_bb;
9215 9198 : tree vec_init, vec_step;
9216 9198 : tree new_name;
9217 9198 : gimple *new_stmt;
9218 9198 : gphi *induction_phi;
9219 9198 : tree induc_def, vec_dest;
9220 9198 : tree init_expr, step_expr;
9221 9198 : tree niters_skip;
9222 9198 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9223 9198 : unsigned i;
9224 9198 : gimple_stmt_iterator si;
9225 :
9226 9198 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9227 :
9228 9198 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9229 9198 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9230 9198 : enum vect_induction_op_type induction_type
9231 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9232 :
9233 9198 : gcc_assert (induction_type > vect_step_op_add);
9234 :
9235 9198 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
9236 9198 : gcc_assert (ncopies >= 1);
9237 :
9238 : /* FORNOW. Only handle nonlinear induction in the same loop. */
9239 9198 : if (nested_in_vect_loop_p (loop, stmt_info))
9240 : {
9241 0 : if (dump_enabled_p ())
9242 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9243 : "nonlinear induction in nested loop.\n");
9244 0 : return false;
9245 : }
9246 :
9247 9198 : iv_loop = loop;
9248 9198 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9249 :
9250 : /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
9251 : vector iv update for each iv and a permutation to generate wanted
9252 : vector iv. */
9253 9198 : if (SLP_TREE_LANES (slp_node) > 1)
9254 : {
9255 0 : if (dump_enabled_p ())
9256 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9257 : "SLP induction not supported for nonlinear"
9258 : " induction.\n");
9259 0 : return false;
9260 : }
9261 :
9262 9198 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9263 : {
9264 0 : if (dump_enabled_p ())
9265 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9266 : "floating point nonlinear induction vectorization"
9267 : " not supported.\n");
9268 0 : return false;
9269 : }
9270 :
9271 9198 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9272 9198 : init_expr = vect_phi_initial_value (phi);
9273 9198 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9274 : && TREE_CODE (step_expr) == INTEGER_CST);
9275 : /* step_expr should be aligned with init_expr,
9276 : i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9277 9198 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9278 :
9279 9198 : if (TREE_CODE (init_expr) == INTEGER_CST)
9280 4097 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9281 5101 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9282 : {
9283 : /* INIT_EXPR could be a bit_field, bail out for such case. */
9284 4 : if (dump_enabled_p ())
9285 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9286 : "nonlinear induction vectorization failed:"
9287 : " component type of vectype is not a nop conversion"
9288 : " from type of init_expr.\n");
9289 4 : return false;
9290 : }
9291 :
9292 9194 : switch (induction_type)
9293 : {
9294 3718 : case vect_step_op_neg:
9295 3718 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9296 : return false;
9297 3556 : if (TREE_CODE (init_expr) != INTEGER_CST
9298 282 : && TREE_CODE (init_expr) != REAL_CST)
9299 : {
9300 : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9301 282 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9302 0 : return false;
9303 :
9304 : /* The encoding has 2 interleaved stepped patterns: { 0, nunits, 1, nunits + 1, 2, nunits + 2, ... }. */
9305 282 : vec_perm_builder sel (nunits, 2, 3);
9306 282 : machine_mode mode = TYPE_MODE (vectype);
9307 282 : sel.quick_grow (6);
9308 1410 : for (i = 0; i < 3; i++)
9309 : {
9310 846 : sel[i * 2] = i;
9311 846 : sel[i * 2 + 1] = i + nunits;
9312 : }
9313 282 : vec_perm_indices indices (sel, 2, nunits);
9314 282 : if (!can_vec_perm_const_p (mode, mode, indices))
9315 0 : return false;
9316 282 : }
9317 : break;
9318 :
9319 1066 : case vect_step_op_mul:
9320 1066 : {
9321 : /* Check for backend support of MULT_EXPR. */
9322 1066 : if (!directly_supported_p (MULT_EXPR, vectype))
9323 : return false;
9324 :
9325 : /* ?? How to construct vector step for variable number vector.
9326 : [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9327 : if (!vf.is_constant ())
9328 : return false;
9329 : }
9330 : break;
9331 :
9332 4092 : case vect_step_op_shr:
9333 : /* Check for backend support of RSHIFT_EXPR. */
9334 4092 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9335 : return false;
9336 :
9337 : /* Don't shift more than type precision to avoid undefined behavior: NUNITS * step must stay below the precision of INIT_EXPR's type. */
9338 26 : if (!tree_fits_uhwi_p (step_expr)
9339 26 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9340 : TYPE_PRECISION (TREE_TYPE (init_expr))))
9341 : return false;
9342 : break;
9343 :
9344 318 : case vect_step_op_shl:
9345 : /* Check for backend support of LSHIFT_EXPR. */
9346 318 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9347 : return false;
9348 :
9349 : /* Don't shift more than type precision to avoid undefined behavior: NUNITS * step must stay below the precision of INIT_EXPR's type. */
9350 12 : if (!tree_fits_uhwi_p (step_expr)
9351 12 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9352 : TYPE_PRECISION (TREE_TYPE (init_expr))))
9353 : return false;
9354 :
9355 : break;
9356 :
9357 0 : default:
9358 0 : gcc_unreachable ();
9359 : }
9360 :
9361 4420 : if (cost_vec) /* Analysis only, transformation not required. */
9362 : {
9363 3504 : unsigned inside_cost = 0, prologue_cost = 0;
9364 : /* Record the loop body cost of the NCOPIES vector statements of the
9365 : vectorized loop. */
9366 3504 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9367 : slp_node, 0, vect_body);
9368 :
9369 : /* Neg induction doesn't have any inside_cost. NOTE(review): this only resets
9370 : the value dumped below; the cost recorded above is not revisited here. */
9371 3504 : if (induction_type == vect_step_op_neg)
9372 2734 : inside_cost = 0;
9373 :
9374 : /* prologue cost for vec_init and vec_step. */
9375 3504 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9376 : slp_node, 0, vect_prologue);
9377 :
9378 3504 : if (dump_enabled_p ())
9379 68 : dump_printf_loc (MSG_NOTE, vect_location,
9380 : "vect_model_induction_cost: inside_cost = %d, "
9381 : "prologue_cost = %d. \n", inside_cost,
9382 : prologue_cost);
9383 :
9384 3504 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9385 3504 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9386 3504 : return true;
9387 : }
9388 :
9389 : /* Transform. */
9390 :
9391 : /* Compute a vector variable, initialized with the first values of
9392 : the induction variable. E.g., for a mul iv with IV_PHI='X' and
9393 : evolution S, for a vector of 4 units, we want to compute:
9394 : [X, X * S, X * S*S, X * S*S*S]. */
9395 :
9396 916 : if (dump_enabled_p ())
9397 32 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9398 :
9399 916 : pe = loop_preheader_edge (iv_loop);
9400 : /* Find the first insertion point in the BB. */
9401 916 : basic_block bb = gimple_bb (phi);
9402 916 : si = gsi_after_labels (bb);
9403 :
9404 916 : gimple_seq stmts = NULL;
9405 :
9406 916 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9407 : /* If we are using the loop mask to "peel" for alignment then we need
9408 : to adjust the start value here. */
9409 916 : if (niters_skip != NULL_TREE)
9410 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9411 : step_expr, induction_type, false);
9412 :
9413 916 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9414 : step_expr, nunits, vectype,
9415 : induction_type);
9416 916 : if (stmts)
9417 : {
9418 162 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9419 162 : gcc_assert (!new_bb);
9420 : }
9421 :
9422 916 : stmts = NULL;
9423 916 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9424 : vf, induction_type);
9425 916 : if (stmts)
9426 : {
9427 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9428 0 : gcc_assert (!new_bb);
9429 : }
9430 :
9431 916 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9432 : new_name, vectype,
9433 : induction_type);
9434 : /* Create the following def-use cycle:
9435 : loop prolog:
9436 : vec_init = ...
9437 : vec_step = ...
9438 : loop:
9439 : vec_iv = PHI <vec_init, vec_loop>
9440 : ...
9441 : STMT
9442 : ...
9443 : vec_loop = vec_iv OP vec_step; (OP depends on INDUCTION_TYPE) */
9444 :
9445 : /* Create the induction-phi that defines the induction-operand. */
9446 916 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9447 916 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9448 916 : induc_def = PHI_RESULT (induction_phi);
9449 :
9450 : /* Create the iv update inside the loop. */
9451 916 : stmts = NULL;
9452 916 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9453 : induc_def, vec_step,
9454 : induction_type);
9455 :
9456 916 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9457 916 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9458 :
9459 : /* Set the arguments of the phi node: */
9460 916 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9461 916 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9462 : UNKNOWN_LOCATION);
9463 :
9464 916 : slp_node->push_vec_def (induction_phi);
9465 :
9466 : /* In case that vectorization factor (VF) is bigger than the number
9467 : of elements that we can fit in a vectype (nunits), we have to generate
9468 : more than one vector stmt - i.e - we need to "unroll" the
9469 : vector stmt by a factor VF/nunits. For more details see documentation
9470 : in vectorizable_operation. */
9471 :
9472 916 : if (ncopies > 1)
9473 : {
9474 286 : stmts = NULL;
9475 : /* FORNOW. This restriction should be relaxed. */
9476 286 : gcc_assert (!nested_in_vect_loop);
9477 :
9478 286 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9479 : nunits, induction_type);
9480 :
9481 286 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9482 : new_name, vectype,
9483 : induction_type);
9484 286 : vec_def = induc_def;
9485 1046 : for (i = 1; i < ncopies; i++)
9486 : {
9487 : /* vec_i = vec_prev OP vec_step, where this vec_step covers NUNITS iterations. */
9488 474 : stmts = NULL;
9489 474 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9490 : vec_def, vec_step,
9491 : induction_type);
9492 474 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9493 474 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9494 474 : slp_node->push_vec_def (new_stmt);
9495 : }
9496 : }
9497 :
9498 916 : if (dump_enabled_p ())
9499 64 : dump_printf_loc (MSG_NOTE, vect_location,
9500 : "transform induction: created def-use cycle: %G%G",
9501 32 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9502 :
9503 : return true;
9504 : }
9505 :
9506 : /* Function vectorizable_induction
9507 :
9508 : Check if STMT_INFO performs an induction computation that can be vectorized.
9509 : If COST_VEC is non-null only analyze and record the costs in it; otherwise create the
9510 : vectorized induction PHI(s) in the header of the loop containing the PHI and record them as vector defs of SLP_NODE.
9511 : Return true if STMT_INFO is vectorizable in this way. */
9512 :
9513 : bool
9514 316467 : vectorizable_induction (loop_vec_info loop_vinfo,
9515 : stmt_vec_info stmt_info,
9516 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9517 : {
9518 316467 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9519 316467 : bool nested_in_vect_loop = false;
9520 316467 : class loop *iv_loop;
9521 316467 : tree vec_def;
9522 316467 : edge pe = loop_preheader_edge (loop);
9523 316467 : basic_block new_bb;
9524 316467 : tree vec_init = NULL_TREE, vec_step, t;
9525 316467 : tree new_name;
9526 316467 : gphi *induction_phi;
9527 316467 : tree induc_def, vec_dest;
9528 316467 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9529 316467 : unsigned i;
9530 316467 : tree expr;
9531 316467 : tree index_vectype = NULL_TREE;
9532 316467 : gimple_stmt_iterator si;
9533 316467 : enum vect_induction_op_type induction_type
9534 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9535 :
9536 347666 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9537 170445 : if (!phi)
9538 : return false;
9539 :
9540 170445 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9541 : return false;
9542 :
9543 : /* Make sure it was recognized as induction computation. */
9544 170445 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9545 : return false;
9546 :
9547 : /* Handle nonlinear induction in a separate place. */
9548 166454 : if (induction_type != vect_step_op_add)
9549 9198 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9550 9198 : slp_node, cost_vec);
9551 :
9552 157256 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9553 157256 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9554 :
9555 : /* FORNOW. These restrictions should be relaxed: an inner-loop induction whose latch value is used outside the inner loop is rejected unless that use is relevant and not live. */
9556 157256 : if (nested_in_vect_loop_p (loop, stmt_info))
9557 : {
9558 813 : imm_use_iterator imm_iter;
9559 813 : use_operand_p use_p;
9560 813 : gimple *exit_phi;
9561 813 : edge latch_e;
9562 813 : tree loop_arg;
9563 :
9564 813 : exit_phi = NULL;
9565 813 : latch_e = loop_latch_edge (loop->inner);
9566 813 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9567 2475 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9568 : {
9569 873 : gimple *use_stmt = USE_STMT (use_p);
9570 873 : if (is_gimple_debug (use_stmt))
9571 36 : continue;
9572 :
9573 837 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9574 : {
9575 : exit_phi = use_stmt;
9576 : break;
9577 : }
9578 813 : }
9579 813 : if (exit_phi)
9580 : {
9581 24 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9582 24 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9583 8 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9584 : {
9585 16 : if (dump_enabled_p ())
9586 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9587 : "inner-loop induction only used outside "
9588 : "of the outer vectorized loop.\n");
9589 16 : return false;
9590 : }
9591 : }
9592 :
9593 797 : nested_in_vect_loop = true;
9594 797 : iv_loop = loop->inner;
9595 : }
9596 : else
9597 : iv_loop = loop;
9598 157240 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9599 :
9600 157240 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
9601 : {
9602 : /* The current SLP code creates the step value element-by-element. */
9603 : if (dump_enabled_p ())
9604 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9605 : "SLP induction not supported for variable-length"
9606 : " vectors.\n");
9607 : return false;
9608 : }
9609 :
9610 157240 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9611 : {
9612 12 : if (dump_enabled_p ())
9613 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9614 : "floating point induction vectorization disabled\n");
9615 12 : return false;
9616 : }
9617 :
9618 157228 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9619 157228 : gcc_assert (step_expr != NULL_TREE);
9620 314432 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
9621 314333 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
9622 : {
9623 12 : if (dump_enabled_p ())
9624 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9625 : "bit-precision induction vectorization not "
9626 : "supported.\n");
9627 12 : return false;
9628 : }
9629 157216 : tree stept = TREE_TYPE (step_expr);
9630 157216 : tree step_vectype = get_same_sized_vectype (stept, vectype);
9631 157216 : stept = TREE_TYPE (step_vectype);
9632 :
9633 : /* Check for target support of the vectorized arithmetic used here. */
9634 157216 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
9635 157216 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
9636 27168 : return false;
9637 130048 : if (!nunits.is_constant ())
9638 : {
9639 : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
9640 : return false;
9641 : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
9642 : if (SCALAR_FLOAT_TYPE_P (stept))
9643 : {
9644 : tree index_type = build_nonstandard_integer_type
9645 : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
9646 :
9647 : index_vectype = build_vector_type (index_type, nunits);
9648 : if (!can_float_p (TYPE_MODE (step_vectype),
9649 : TYPE_MODE (index_vectype), 1))
9650 : return false;
9651 : }
9652 : }
9653 :
9654 130048 : unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
9655 130048 : if (cost_vec) /* Analysis only, transformation not required. */
9656 : {
9657 344049 : unsigned inside_cost = 0, prologue_cost = 0;
9658 : /* We eventually need to set a vector type on invariant
9659 : arguments. */
9660 : unsigned j;
9661 : slp_tree child;
9662 344049 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9663 229366 : if (!vect_maybe_update_slp_op_vectype
9664 229366 : (child, SLP_TREE_VECTYPE (slp_node)))
9665 : {
9666 0 : if (dump_enabled_p ())
9667 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9668 : "incompatible vector types for "
9669 : "invariants\n");
9670 0 : return false;
9671 : }
9672 : /* loop cost for vec_loop. */
9673 114683 : inside_cost = record_stmt_cost (cost_vec, nvects,
9674 : vector_stmt, slp_node, 0, vect_body);
9675 : /* prologue cost for vec_init (if not nested) and step. */
9676 114683 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9677 : scalar_to_vec,
9678 : slp_node, 0, vect_prologue);
9679 114683 : if (dump_enabled_p ())
9680 4076 : dump_printf_loc (MSG_NOTE, vect_location,
9681 : "vect_model_induction_cost: inside_cost = %d, "
9682 : "prologue_cost = %d .\n", inside_cost,
9683 : prologue_cost);
9684 :
9685 114683 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9686 114683 : DUMP_VECT_SCOPE ("vectorizable_induction");
9687 114683 : return true;
9688 : }
9689 :
9690 : /* Transform. */
9691 :
9692 : /* Compute a vector variable, initialized with the first VF values of
9693 : the induction variable. E.g., for an iv with IV_PHI='X' and
9694 : evolution S, for a vector of 4 units, we want to compute:
9695 : [X, X + S, X + 2*S, X + 3*S]. */
9696 :
9697 15365 : if (dump_enabled_p ())
9698 2779 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9699 :
9700 15365 : pe = loop_preheader_edge (iv_loop);
9701 : /* Find the first insertion point in the BB. */
9702 15365 : basic_block bb = gimple_bb (phi);
9703 15365 : si = gsi_after_labels (bb);
9704 :
9705 : /* For SLP induction we have to generate several IVs as for example
9706 : with group size 3 we need
9707 : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9708 : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9709 15365 : gimple_stmt_iterator incr_si;
9710 15365 : bool insert_after;
9711 15365 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
9712 :
9713 : /* The initial values are vectorized, but any lanes > group_size
9714 : need adjustment. When INIT_NODE is null the scalar PHI arguments are used instead. */
9715 15365 : slp_tree init_node
9716 15365 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9717 :
9718 : /* Gather steps. Since we do not vectorize inductions as
9719 : cycles we have to reconstruct the step from SCEV data. */
9720 15365 : unsigned group_size = SLP_TREE_LANES (slp_node);
9721 15365 : tree *steps = XALLOCAVEC (tree, group_size);
9722 15365 : tree *inits = XALLOCAVEC (tree, group_size);
9723 15365 : stmt_vec_info phi_info;
9724 47323 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9725 : {
9726 16593 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9727 16593 : if (!init_node)
9728 16348 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9729 : pe->dest_idx);
9730 : }
9731 :
9732 : /* Now generate the IVs. */
9733 30730 : gcc_assert (multiple_p (nunits * nvects, group_size));
9734 15365 : unsigned nivs;
9735 15365 : unsigned HOST_WIDE_INT const_nunits;
9736 15365 : if (nested_in_vect_loop)
9737 : nivs = nvects;
9738 15141 : else if (nunits.is_constant (&const_nunits))
9739 : {
9740 : /* Compute the number of distinct IVs we need. First reduce
9741 : group_size if it is a multiple of const_nunits so we get
9742 : one IV for a group_size of 4 but const_nunits 2. */
9743 15141 : unsigned group_sizep = group_size;
9744 15141 : if (group_sizep % const_nunits == 0)
9745 111 : group_sizep = group_sizep / const_nunits;
9746 15141 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
9747 : }
9748 : else
9749 : {
9750 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9751 : nivs = 1;
9752 : }
9753 15365 : gimple_seq init_stmts = NULL;
9754 15365 : tree lupdate_mul = NULL_TREE;
9755 224 : if (!nested_in_vect_loop)
9756 : {
9757 15141 : if (nunits.is_constant (&const_nunits))
9758 : {
9759 : /* The number of iterations covered in one vector iteration. */
9760 15141 : unsigned lup_mul = (nvects * const_nunits) / group_size;
9761 15141 : lupdate_mul
9762 15141 : = build_vector_from_val (step_vectype,
9763 15141 : SCALAR_FLOAT_TYPE_P (stept)
9764 28 : ? build_real_from_wide (stept, lup_mul,
9765 : UNSIGNED)
9766 30254 : : build_int_cstu (stept, lup_mul));
9767 : }
9768 : else
9769 : {
9770 : if (SCALAR_FLOAT_TYPE_P (stept))
9771 : {
9772 : tree tem = build_int_cst (integer_type_node, vf);
9773 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9774 : }
9775 : else
9776 : lupdate_mul = build_int_cst (stept, vf);
9777 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9778 : lupdate_mul);
9779 : }
9780 : }
9781 15365 : tree peel_mul = NULL_TREE;
9782 15365 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9783 : {
9784 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9785 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9786 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9787 : else
9788 0 : peel_mul = gimple_convert (&init_stmts, stept,
9789 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9790 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
9791 : step_vectype, peel_mul);
9792 : }
9793 15365 : tree step_mul = NULL_TREE;
9794 15365 : unsigned ivn;
9795 15365 : auto_vec<tree> vec_steps;
9796 31302 : for (ivn = 0; ivn < nivs; ++ivn)
9797 : {
9798 15937 : gimple_seq stmts = NULL;
9799 15937 : bool invariant = true;
9800 15937 : if (nunits.is_constant (&const_nunits))
9801 : {
9802 15937 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9803 15937 : tree_vector_builder init_elts (vectype, const_nunits, 1);
9804 15937 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9805 102685 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9806 : {
9807 : /* The scalar steps of the IVs. */
9808 86748 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9809 86748 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9810 86748 : step_elts.quick_push (elt);
9811 86748 : if (!init_node)
9812 : {
9813 : /* The scalar inits of the IVs if not vectorized. */
9814 85486 : elt = inits[(ivn*const_nunits + eltn) % group_size];
9815 85486 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
9816 85486 : TREE_TYPE (elt)))
9817 260 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9818 260 : TREE_TYPE (vectype), elt);
9819 85486 : init_elts.quick_push (elt);
9820 : }
9821 : /* The number of steps to add to the initial values. */
9822 86748 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9823 173496 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9824 173394 : ? build_real_from_wide (stept, mul_elt,
9825 : UNSIGNED)
9826 173394 : : build_int_cstu (stept, mul_elt));
9827 : }
9828 15937 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
9829 15937 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9830 15937 : if (!init_node)
9831 15679 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
9832 15937 : }
9833 : else
9834 : {
9835 : tree step = gimple_convert (&init_stmts, stept, steps[0]);
9836 : if (init_node)
9837 : ;
9838 : else if (INTEGRAL_TYPE_P (stept))
9839 : {
9840 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9841 : /* Build the initial value directly as a VEC_SERIES_EXPR. */
9842 : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
9843 : step_vectype, new_name, step);
9844 : if (!useless_type_conversion_p (vectype, step_vectype))
9845 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9846 : vectype, vec_init);
9847 : }
9848 : else
9849 : {
9850 : /* Build:
9851 : [base, base, base, ...]
9852 : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9853 : gcc_assert (SCALAR_FLOAT_TYPE_P (stept));
9854 : gcc_assert (flag_associative_math);
9855 : gcc_assert (index_vectype != NULL_TREE);
9856 :
9857 : tree index = build_index_vector (index_vectype, 0, 1);
9858 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9859 : tree base_vec = gimple_build_vector_from_val (&init_stmts,
9860 : step_vectype,
9861 : new_name);
9862 : tree step_vec = gimple_build_vector_from_val (&init_stmts,
9863 : step_vectype,
9864 : step);
9865 : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
9866 : step_vectype, index);
9867 : vec_init = gimple_build (&init_stmts, MULT_EXPR,
9868 : step_vectype, vec_init, step_vec);
9869 : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
9870 : step_vectype, vec_init, base_vec);
9871 : if (!useless_type_conversion_p (vectype, step_vectype))
9872 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9873 : vectype, vec_init);
9874 : }
9875 : /* Broadcast the single scalar step to every lane. Generate:
9876 : vec_step = [S, S, S, S] */
9877 : t = unshare_expr (step);
9878 : gcc_assert (CONSTANT_CLASS_P (t)
9879 : || TREE_CODE (t) == SSA_NAME);
9880 : vec_step = gimple_build_vector_from_val (&init_stmts,
9881 : step_vectype, t);
9882 : }
9883 15937 : vec_steps.safe_push (vec_step);
9884 15937 : if (peel_mul)
9885 : {
9886 0 : if (!step_mul)
9887 : {
9888 0 : gcc_assert (!nunits.is_constant ());
9889 : step_mul = gimple_build (&init_stmts,
9890 : MINUS_EXPR, step_vectype,
9891 : build_zero_cst (step_vectype), peel_mul);
9892 : }
9893 : else
9894 0 : step_mul = gimple_build (&init_stmts,
9895 : MINUS_EXPR, step_vectype,
9896 : step_mul, peel_mul);
9897 : }
9898 :
9899 : /* Create the induction-phi that defines the induction-operand. */
9900 15937 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9901 : "vec_iv_");
9902 15937 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9903 15937 : induc_def = PHI_RESULT (induction_phi);
9904 :
9905 : /* Create the iv update inside the loop. */
9906 15937 : tree up = vec_step;
9907 15937 : if (lupdate_mul)
9908 : {
9909 15679 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
9910 : {
9911 : /* When we're using loop_len produced by SELECT_VL, the
9912 : non-final iterations are not always processing VF
9913 : elements. So vectorize induction variable instead of
9914 :
9915 : _21 = vect_vec_iv_.6_22 + { VF, ... };
9916 :
9917 : We should generate:
9918 :
9919 : _35 = .SELECT_VL (ivtmp_33, VF);
9920 : vect_cst__22 = [vec_duplicate_expr] _35;
9921 : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
9922 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
9923 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
9924 : vectype, 0, 0, false);
9925 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9926 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
9927 : else
9928 0 : expr = gimple_convert (&stmts, stept, len);
9929 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
9930 : expr);
9931 0 : up = gimple_build (&stmts, MULT_EXPR,
9932 : step_vectype, vec_step, lupdate_mul);
9933 : }
9934 : else
9935 15679 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9936 : vec_step, lupdate_mul);
9937 : }
9938 15937 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9939 15937 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
9940 15937 : vec_def = gimple_convert (&stmts, vectype, vec_def);
9941 15937 : insert_iv_increment (&incr_si, insert_after, stmts);
9942 15937 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9943 : UNKNOWN_LOCATION);
9944 :
9945 15937 : if (init_node)
9946 258 : vec_init = vect_get_slp_vect_def (init_node, ivn);
9947 15937 : if (!nested_in_vect_loop
9948 15937 : && step_mul
9949 15937 : && !integer_zerop (step_mul))
9950 : {
9951 15238 : gcc_assert (invariant);
9952 15238 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9953 15238 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9954 : vec_step, step_mul);
9955 15238 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9956 : vec_def, up);
9957 15238 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9958 : }
9959 :
9960 : /* Set the preheader argument of the phi node; the latch argument was added above. */
9961 15937 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9962 :
9963 15937 : slp_node->push_vec_def (induction_phi);
9964 : }
9965 15365 : if (!nested_in_vect_loop)
9966 : {
9967 : /* Fill up to the number of vectors we need for the whole group. */
9968 15141 : if (nunits.is_constant (&const_nunits))
9969 15141 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
9970 : else
9971 : nivs = 1;
9972 15141 : vec_steps.reserve (nivs-ivn);
9973 30303 : for (; ivn < nivs; ++ivn)
9974 : {
9975 21 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9976 21 : vec_steps.quick_push (vec_steps[0]);
9977 : }
9978 : }
9979 :
9980 : /* Re-use IVs when we can. We are generating further vector
9981 : stmts by adding VF' * stride to the IVs generated above. */
9982 15365 : if (ivn < nvects)
9983 : {
9984 3392 : if (nunits.is_constant (&const_nunits))
9985 : {
9986 3392 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
9987 3392 : / group_size);
9988 3392 : lupdate_mul
9989 3392 : = build_vector_from_val (step_vectype,
9990 3392 : SCALAR_FLOAT_TYPE_P (stept)
9991 8 : ? build_real_from_wide (stept,
9992 8 : vfp, UNSIGNED)
9993 6776 : : build_int_cstu (stept, vfp));
9994 : }
9995 : else
9996 : {
9997 : if (SCALAR_FLOAT_TYPE_P (stept))
9998 : {
9999 : tree tem = build_int_cst (integer_type_node, nunits);
10000 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
10001 : }
10002 : else
10003 : lupdate_mul = build_int_cst (stept, nunits);
10004 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
10005 : lupdate_mul);
10006 : }
10007 10900 : for (; ivn < nvects; ++ivn)
10008 : {
10009 7508 : gimple *iv
10010 7508 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10011 7508 : tree def = gimple_get_lhs (iv);
10012 7508 : if (ivn < 2*nivs)
10013 3480 : vec_steps[ivn - nivs]
10014 3480 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10015 3480 : vec_steps[ivn - nivs], lupdate_mul);
10016 7508 : gimple_seq stmts = NULL;
10017 7508 : def = gimple_convert (&stmts, step_vectype, def);
10018 22524 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10019 7508 : def, vec_steps[ivn % nivs]);
10020 7508 : def = gimple_convert (&stmts, vectype, def);
10021 7508 : if (gimple_code (iv) == GIMPLE_PHI)
10022 3480 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10023 : else
10024 : {
10025 4028 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10026 4028 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10027 : }
10028 7508 : slp_node->push_vec_def (def);
10029 : }
10030 : }
10031 :
10032 15365 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10033 15365 : gcc_assert (!new_bb); /* Inserting on the preheader edge must not have created a new block. */
10034 :
10035 15365 : return true;
10036 15365 : }
10037 :
10038 : /* Function vectorizable_live_operation_1.
10039 :
10040 : Helper function for vectorizable_live_operation. Create a PHI for VEC_LHS in EXIT_BB and extract from it the lane selected by BITSIZE and BITSTART (or, when BITSTART is nonzero and the loop is fully length- or mask-controlled, the element chosen by the loop length or loop mask via VEC_EXTRACT or EXTRACT_LAST). Convert the result to LHS_TYPE, insert the generated statements after the labels of EXIT_BB, set *EXIT_GSI to that insertion point and return the extracted value. */
10041 :
10042 : static tree
10043 2843 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
10044 : tree vectype, slp_tree slp_node,
10045 : tree bitsize, tree bitstart, tree vec_lhs,
10046 : tree lhs_type, gimple_stmt_iterator *exit_gsi)
10047 : {
10048 2843 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10049 :
10050 2843 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10051 2843 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10052 5688 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++) /* Every argument of the new PHI is VEC_LHS. */
10053 2845 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
10054 :
10055 2843 : gimple_seq stmts = NULL;
10056 2843 : tree new_tree;
10057 :
10058 : /* If bitstart is 0 then we can use a BIT_FIELD_REF. */
10059 2843 : if (integer_zerop (bitstart))
10060 : {
10061 217 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10062 : vec_lhs_phi, bitsize, bitstart);
10063 :
10064 : /* Convert the extracted vector element to the scalar type. */
10065 217 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10066 : }
10067 2626 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10068 : {
10069 : /* Emit:
10070 :
10071 : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>
10072 :
10073 : where VEC_LHS is the vectorized live-out result, LEN is the length of
10074 : the vector, BIAS is the load-store bias. The bias should not be used
10075 : at all since we are not using load/store operations, but LEN will be
10076 : REALLEN + BIAS, so subtract it to get to the correct position. */
10077 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10078 0 : gimple_seq tem = NULL;
10079 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10080 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
10081 : &LOOP_VINFO_LENS (loop_vinfo),
10082 : 1, vectype, 0, 1, false);
10083 0 : gimple_seq_add_seq (&stmts, tem);
10084 :
10085 : /* LAST_INDEX = LEN - 1. */
10086 0 : tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
10087 0 : len, build_one_cst (TREE_TYPE (len)));
10088 :
10089 : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>. */
10090 0 : tree scalar_res
10091 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10092 : vec_lhs_phi, last_index);
10093 :
10094 : /* Convert the extracted vector element to the scalar type. */
10095 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10096 : }
10097 2626 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10098 : {
10099 : /* Emit:
10100 :
10101 : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10102 :
10103 : where VEC_LHS is the vectorized live-out result and MASK is
10104 : the loop mask for the final iteration. */
10105 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10106 0 : tree scalar_type = TREE_TYPE (vectype);
10107 0 : gimple_seq tem = NULL;
10108 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10109 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10110 : &LOOP_VINFO_MASKS (loop_vinfo),
10111 : 1, vectype, 0);
10112 0 : tree scalar_res;
10113 0 : gimple_seq_add_seq (&stmts, tem);
10114 :
10115 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10116 : mask, vec_lhs_phi);
10117 :
10118 : /* Convert the extracted vector element to the scalar type. */
10119 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10120 : }
10121 : else
10122 : {
10123 2626 : tree bftype = TREE_TYPE (vectype);
10124 2626 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10125 85 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10126 2626 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10127 2626 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10128 : &stmts, true, NULL_TREE);
10129 : }
10130 :
10131 2843 : *exit_gsi = gsi_after_labels (exit_bb); /* Reported back to the caller through EXIT_GSI. */
10132 2843 : if (stmts)
10133 2843 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10134 :
10135 2843 : return new_tree;
10136 : }
10137 :
10138 : /* Function vectorizable_live_operation.
10139 :
10140 : STMT_INFO computes a value that is used outside the loop. Check if
10141 : it can be supported. */
10142 :
10143 : bool
10144 262745 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10145 : slp_tree slp_node, slp_instance slp_node_instance,
10146 : int slp_index, bool vec_stmt_p,
10147 : stmt_vector_for_cost *cost_vec)
10148 : {
10149 262745 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10150 262745 : imm_use_iterator imm_iter;
10151 262745 : tree lhs, lhs_type, bitsize;
10152 262745 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10153 262745 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10154 262745 : gimple *use_stmt;
10155 262745 : use_operand_p use_p;
10156 262745 : auto_vec<tree> vec_oprnds;
10157 262745 : int vec_entry = 0;
10158 262745 : poly_uint64 vec_index = 0;
10159 :
10160 262745 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10161 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10162 :
10163 : /* If a stmt of a reduction is live, vectorize it via
10164 : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10165 : validity so just trigger the transform here. */
10166 262745 : if (vect_is_reduction (slp_node))
10167 : {
10168 86641 : if (!vec_stmt_p)
10169 : {
10170 63418 : SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
10171 63418 : return true;
10172 : }
10173 : /* For SLP reductions we vectorize the epilogue for all involved stmts
10174 : together. For SLP reduction chains we only get here once. */
10175 23223 : if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
10176 22967 : && slp_index != 0)
10177 : return true;
10178 22775 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
10179 22775 : if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10180 22775 : || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10181 : return true;
10182 :
10183 21928 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10184 21928 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10185 21919 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10186 : slp_node_instance,
10187 : LOOP_VINFO_MAIN_EXIT (loop_vinfo));
10188 :
10189 : /* If early break we only have to materialize the reduction on the merge
10190 : block, but we have to find an alternate exit first. */
10191 21928 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10192 : {
10193 28 : slp_tree phis_node = slp_node_instance->reduc_phis;
10194 28 : stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
10195 89 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10196 28 : if (exit != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
10197 : {
10198 23 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10199 : phis_node, slp_node_instance,
10200 : exit);
10201 23 : break;
10202 28 : }
10203 28 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10204 9 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10205 : phis_node, slp_node_instance,
10206 : LOOP_VINFO_MAIN_EXIT
10207 : (loop_vinfo));
10208 : }
10209 :
10210 21928 : return true;
10211 : }
10212 :
10213 : /* If STMT is not relevant and it is a simple assignment and its inputs are
10214 : invariant then it can remain in place, unvectorized. The original last
10215 : scalar value that it computes will be used. */
10216 176104 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10217 : {
10218 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10219 0 : if (dump_enabled_p ())
10220 0 : dump_printf_loc (MSG_NOTE, vect_location,
10221 : "statement is simple and uses invariant. Leaving in "
10222 : "place.\n");
10223 0 : return true;
10224 : }
10225 :
10226 176104 : gcc_assert (slp_index >= 0);
10227 :
10228 : /* Get the last occurrence of the scalar index from the concatenation of
10229 : all the slp vectors. Calculate which slp vector it is and the index
10230 : within. */
10231 176104 : int num_scalar = SLP_TREE_LANES (slp_node);
10232 176104 : int num_vec = vect_get_num_copies (vinfo, slp_node);
10233 176104 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10234 :
10235 : /* Calculate which vector contains the result, and which lane of
10236 : that vector we need. */
10237 176104 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10238 : {
10239 : if (dump_enabled_p ())
10240 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10241 : "Cannot determine which vector holds the"
10242 : " final result.\n");
10243 : return false;
10244 : }
10245 :
10246 176104 : if (!vec_stmt_p)
10247 : {
10248 : /* No transformation required. */
10249 137698 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10250 : {
10251 27443 : if (SLP_TREE_LANES (slp_node) != 1)
10252 : {
10253 19 : if (dump_enabled_p ())
10254 19 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10255 : "can't operate on partial vectors "
10256 : "because an SLP statement is live after "
10257 : "the loop.\n");
10258 19 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10259 : }
10260 27424 : else if (num_vec > 1)
10261 : {
10262 15709 : if (dump_enabled_p ())
10263 53 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10264 : "can't operate on partial vectors "
10265 : "because ncopies is greater than 1.\n");
10266 15709 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10267 : }
10268 : else
10269 : {
10270 11715 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10271 : OPTIMIZE_FOR_SPEED))
10272 0 : vect_record_loop_mask (loop_vinfo,
10273 : &LOOP_VINFO_MASKS (loop_vinfo),
10274 : 1, vectype, NULL);
10275 11715 : else if (can_vec_extract_var_idx_p (
10276 11715 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10277 0 : vect_record_loop_len (loop_vinfo,
10278 : &LOOP_VINFO_LENS (loop_vinfo),
10279 : 1, vectype, 1);
10280 : else
10281 : {
10282 11715 : if (dump_enabled_p ())
10283 655 : dump_printf_loc (
10284 655 : MSG_MISSED_OPTIMIZATION, vect_location,
10285 : "can't operate on partial vectors "
10286 : "because the target doesn't support extract "
10287 : "last reduction.\n");
10288 11715 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10289 : }
10290 : }
10291 : }
10292 : /* ??? Enable for loop costing as well. */
10293 27443 : if (!loop_vinfo)
10294 65831 : record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
10295 : 0, vect_epilogue);
10296 137698 : SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
10297 137698 : return true;
10298 : }
10299 :
10300 : /* Use the lhs of the original scalar statement. */
10301 38406 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10302 38406 : if (dump_enabled_p ())
10303 988 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10304 : "stmt %G", stmt);
10305 :
10306 38406 : lhs = gimple_get_lhs (stmt);
10307 38406 : lhs_type = TREE_TYPE (lhs);
10308 :
10309 38406 : bitsize = vector_element_bits_tree (vectype);
10310 :
10311 : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10312 38406 : gcc_assert (!loop_vinfo
10313 : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10314 : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10315 : || SLP_TREE_LANES (slp_node) == 1));
10316 :
10317 : /* Get the correct slp vectorized stmt. */
10318 38406 : tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10319 38406 : gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10320 :
10321 : /* In case we need to early break vectorize also get the first stmt. */
10322 38406 : tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10323 :
10324 : /* Get entry to use. */
10325 38406 : tree bitstart = bitsize_int (vec_index);
10326 38406 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10327 :
10328 38406 : if (loop_vinfo)
10329 : {
10330 : /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10331 : requirement, insert one phi node for it. It looks like:
10332 : loop;
10333 : BB:
10334 : # lhs' = PHI <lhs>
10335 : ==>
10336 : loop;
10337 : BB:
10338 : # vec_lhs' = PHI <vec_lhs>
10339 : new_tree = lane_extract <vec_lhs', ...>;
10340 : lhs' = new_tree; */
10341 :
10342 2906 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10343 : /* Check if we have a loop where the chosen exit is not the main exit,
10344 : in these cases for an early break we restart the iteration the vector code
10345 : did. For the live values we want the value at the start of the iteration
10346 : rather than at the end. */
10347 2906 : edge main_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
10348 2906 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10349 15066 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10350 9254 : if (!is_gimple_debug (use_stmt)
10351 9254 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10352 2843 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10353 : {
10354 2843 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10355 2843 : phi_arg_index_from_use (use_p));
10356 2843 : gcc_assert (loop_exit_edge_p (loop, e));
10357 2843 : bool main_exit_edge = e == main_e;
10358 2843 : tree tmp_vec_lhs = vec_lhs;
10359 2843 : tree tmp_bitstart = bitstart;
10360 :
10361 : /* For early exit where the exit is not in the BB that leads
10362 : to the latch then we're restarting the iteration in the
10363 : scalar loop. So get the first live value. */
10364 2843 : bool early_break_first_element_p
10365 2843 : = all_exits_as_early_p || !main_exit_edge;
10366 2843 : if (early_break_first_element_p)
10367 : {
10368 199 : tmp_vec_lhs = vec_lhs0;
10369 199 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10370 : }
10371 :
10372 2843 : gimple_stmt_iterator exit_gsi;
10373 2843 : tree new_tree
10374 2843 : = vectorizable_live_operation_1 (loop_vinfo,
10375 : e->dest, vectype,
10376 : slp_node, bitsize,
10377 : tmp_bitstart, tmp_vec_lhs,
10378 : lhs_type, &exit_gsi);
10379 :
10380 2843 : auto gsi = gsi_for_stmt (use_stmt);
10381 2843 : tree lhs_phi = gimple_phi_result (use_stmt);
10382 2843 : remove_phi_node (&gsi, false);
10383 2843 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10384 2843 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10385 2843 : break;
10386 2906 : }
10387 :
10388 : /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10389 12223 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10390 6411 : gcc_assert (is_gimple_debug (use_stmt)
10391 2906 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10392 : }
10393 : else
10394 : {
10395 : /* For basic-block vectorization simply insert the lane-extraction. */
10396 35500 : tree bftype = TREE_TYPE (vectype);
10397 35500 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10398 2 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10399 35500 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10400 : vec_lhs, bitsize, bitstart);
10401 35500 : gimple_seq stmts = NULL;
10402 35500 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10403 : &stmts, true, NULL_TREE);
10404 35500 : if (TREE_CODE (new_tree) == SSA_NAME
10405 71000 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10406 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10407 35500 : if (is_a <gphi *> (vec_stmt))
10408 : {
10409 2501 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10410 2501 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10411 : }
10412 : else
10413 : {
10414 32999 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10415 32999 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10416 : }
10417 :
10418 : /* Replace use of lhs with newly computed result. If the use stmt is a
10419 : single arg PHI, just replace all uses of PHI result. It's necessary
10420 : because lcssa PHI defining lhs may be before newly inserted stmt. */
10421 35500 : use_operand_p use_p;
10422 35500 : stmt_vec_info use_stmt_info;
10423 209402 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10424 138402 : if (!is_gimple_debug (use_stmt)
10425 138402 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10426 101806 : || !PURE_SLP_STMT (use_stmt_info)))
10427 : {
10428 : /* ??? This can happen when the live lane ends up being
10429 : rooted in a vector construction code-generated by an
10430 : external SLP node (and code-generation for that already
10431 : happened).
10432 : Doing this is what would happen if that vector CTOR
10433 : were not code-generated yet so it is not too bad.
10434 : ??? In fact we'd likely want to avoid this situation
10435 : in the first place. */
10436 61897 : if (TREE_CODE (new_tree) == SSA_NAME
10437 61897 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10438 61897 : && gimple_code (use_stmt) != GIMPLE_PHI
10439 117100 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10440 : use_stmt))
10441 : {
10442 0 : if (dump_enabled_p ())
10443 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10444 : "Using original scalar computation for "
10445 : "live lane because use preceeds vector "
10446 : "def\n");
10447 0 : continue;
10448 : }
10449 189919 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10450 : {
10451 : /* ??? It can also happen that we end up pulling a def into
10452 : a loop where replacing out-of-loop uses would require
10453 : a new LC SSA PHI node. Retain the original scalar in
10454 : those cases as well. PR98064. */
10455 64011 : edge e;
10456 64011 : if (TREE_CODE (new_tree) == SSA_NAME
10457 64011 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10458 64011 : && (gimple_bb (use_stmt)->loop_father
10459 64011 : != gimple_bb (vec_stmt)->loop_father)
10460 : /* But a replacement in a LC PHI is OK. This happens
10461 : in gcc.dg/vect/bb-slp-57.c for example. */
10462 7369 : && (gimple_code (use_stmt) != GIMPLE_PHI
10463 3218 : || (((e = phi_arg_edge_from_use (use_p)), true)
10464 3218 : && !loop_exit_edge_p
10465 3218 : (gimple_bb (vec_stmt)->loop_father, e)))
10466 69616 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10467 5605 : gimple_bb (use_stmt)->loop_father))
10468 : {
10469 0 : if (dump_enabled_p ())
10470 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10471 : "Using original scalar computation for "
10472 : "live lane because there is an "
10473 : "out-of-loop definition for it\n");
10474 0 : continue;
10475 : }
10476 64011 : SET_USE (use_p, new_tree);
10477 : }
10478 61897 : update_stmt (use_stmt);
10479 35500 : }
10480 : }
10481 :
10482 : return true;
10483 262745 : }
10484 :
10485 : /* Given loop represented by LOOP_VINFO, return true if computation of
10486 : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10487 : otherwise. */
10488 :
10489 : static bool
10490 61523 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
10491 : {
10492 61523 : gcc_assert (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo));
10493 :
10494 : /* Constant case. */
10495 61523 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10496 : {
10497 35884 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10498 35884 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10499 :
10500 35884 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10501 35884 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10502 35884 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10503 : return true;
10504 : }
10505 :
10506 25639 : widest_int max;
10507 25639 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10508 : /* Check the upper bound of loop niters. */
10509 25639 : if (get_max_loop_iterations (loop, &max))
10510 : {
10511 25639 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10512 25639 : signop sgn = TYPE_SIGN (type);
10513 25639 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10514 25639 : if (max < type_max)
10515 25418 : return true;
10516 25639 : }
10517 : return false;
10518 25639 : }
10519 :
10520 : /* Return a mask type with half the number of elements as OLD_TYPE,
10521 : given that it should have mode NEW_MODE. */
10522 :
10523 : tree
10524 4795 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10525 : {
10526 4795 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10527 4795 : return build_truth_vector_type_for_mode (nunits, new_mode);
10528 : }
10529 :
10530 : /* Return a mask type with twice as many elements as OLD_TYPE,
10531 : given that it should have mode NEW_MODE. */
10532 :
10533 : tree
10534 7186 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10535 : {
10536 7186 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10537 7186 : return build_truth_vector_type_for_mode (nunits, new_mode);
10538 : }
10539 :
10540 : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10541 : contain a sequence of NVECTORS masks that each control a vector of type
10542 : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10543 : these vector masks with the vector version of SCALAR_MASK. */
10544 :
10545 : void
10546 105143 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10547 : unsigned int nvectors, tree vectype, tree scalar_mask)
10548 : {
10549 105143 : gcc_assert (nvectors != 0);
10550 :
10551 105143 : if (scalar_mask)
10552 : {
10553 4979 : scalar_cond_masked_key cond (scalar_mask, nvectors);
10554 4979 : loop_vinfo->scalar_cond_masked_set.add (cond);
10555 : }
10556 :
10557 105143 : masks->mask_set.add (std::make_pair (vectype, nvectors));
10558 105143 : }
10559 :
10560 : /* Given a complete set of masks MASKS, extract mask number INDEX
10561 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10562 : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10563 :
10564 : See the comment above vec_loop_masks for more details about the mask
10565 : arrangement. */
10566 :
10567 : tree
10568 208 : vect_get_loop_mask (loop_vec_info loop_vinfo,
10569 : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10570 : unsigned int nvectors, tree vectype, unsigned int index)
10571 : {
10572 208 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10573 : == vect_partial_vectors_while_ult)
10574 : {
10575 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10576 0 : tree mask_type = rgm->type;
10577 :
10578 : /* Populate the rgroup's mask array, if this is the first time we've
10579 : used it. */
10580 0 : if (rgm->controls.is_empty ())
10581 : {
10582 0 : rgm->controls.safe_grow_cleared (nvectors, true);
10583 0 : for (unsigned int i = 0; i < nvectors; ++i)
10584 : {
10585 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10586 : /* Provide a dummy definition until the real one is available. */
10587 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10588 0 : rgm->controls[i] = mask;
10589 : }
10590 : }
10591 :
10592 0 : tree mask = rgm->controls[index];
10593 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10594 0 : TYPE_VECTOR_SUBPARTS (vectype)))
10595 : {
10596 : /* A loop mask for data type X can be reused for data type Y
10597 : if X has N times more elements than Y and if Y's elements
10598 : are N times bigger than X's. In this case each sequence
10599 : of N elements in the loop mask will be all-zero or all-one.
10600 : We can then view-convert the mask so that each sequence of
10601 : N elements is replaced by a single element. */
10602 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10603 : TYPE_VECTOR_SUBPARTS (vectype)));
10604 0 : gimple_seq seq = NULL;
10605 0 : mask_type = truth_type_for (vectype);
10606 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10607 0 : if (seq)
10608 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10609 : }
10610 0 : return mask;
10611 : }
10612 208 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10613 : == vect_partial_vectors_avx512)
10614 : {
10615 : /* The number of scalars per iteration and the number of vectors are
10616 : both compile-time constants. */
10617 208 : unsigned int nscalars_per_iter
10618 208 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10619 208 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10620 :
10621 208 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10622 :
10623 : /* The stored nV is dependent on the mask type produced. */
10624 208 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10625 : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10626 : == rgm->factor);
10627 208 : nvectors = rgm->factor;
10628 :
10629 : /* Populate the rgroup's mask array, if this is the first time we've
10630 : used it. */
10631 208 : if (rgm->controls.is_empty ())
10632 : {
10633 20 : rgm->controls.safe_grow_cleared (nvectors, true);
10634 106 : for (unsigned int i = 0; i < nvectors; ++i)
10635 : {
10636 86 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10637 : /* Provide a dummy definition until the real one is available. */
10638 86 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10639 86 : rgm->controls[i] = mask;
10640 : }
10641 : }
10642 208 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10643 : TYPE_VECTOR_SUBPARTS (vectype)))
10644 160 : return rgm->controls[index];
10645 :
10646 : /* Split the vector if needed. Since we are dealing with integer mode
10647 : masks with AVX512 we can operate on the integer representation
10648 : performing the whole vector shifting. */
10649 48 : unsigned HOST_WIDE_INT factor;
10650 48 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10651 48 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
10652 0 : gcc_assert (ok);
10653 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10654 48 : tree mask_type = truth_type_for (vectype);
10655 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10656 48 : unsigned vi = index / factor;
10657 48 : unsigned vpart = index % factor;
10658 48 : tree vec = rgm->controls[vi];
10659 48 : gimple_seq seq = NULL;
10660 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10661 48 : lang_hooks.types.type_for_mode
10662 48 : (TYPE_MODE (rgm->type), 1), vec);
10663 : /* For integer mode masks simply shift the right bits into position. */
10664 48 : if (vpart != 0)
10665 40 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10666 : build_int_cst (integer_type_node,
10667 80 : (TYPE_VECTOR_SUBPARTS (vectype)
10668 40 : * vpart)));
10669 48 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10670 48 : (TYPE_MODE (mask_type), 1), vec);
10671 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10672 48 : if (seq)
10673 48 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10674 48 : return vec;
10675 : }
10676 : else
10677 0 : gcc_unreachable ();
10678 : }
10679 :
10680 : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10681 : lengths for controlling an operation on VECTYPE. The operation splits
10682 : each element of VECTYPE into FACTOR separate subelements, measuring the
10683 : length as a number of these subelements. */
10684 :
10685 : void
10686 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10687 : unsigned int nvectors, tree vectype, unsigned int factor)
10688 : {
10689 0 : gcc_assert (nvectors != 0);
10690 0 : if (lens->length () < nvectors)
10691 0 : lens->safe_grow_cleared (nvectors, true);
10692 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10693 :
10694 : /* The number of scalars per iteration, scalar occupied bytes and
10695 : the number of vectors are both compile-time constants. */
10696 0 : unsigned int nscalars_per_iter
10697 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10698 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10699 :
10700 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10701 : {
10702 : /* For now, we only support cases in which all loads and stores fall back
10703 : to VnQI or none do. */
10704 0 : gcc_assert (!rgl->max_nscalars_per_iter
10705 : || (rgl->factor == 1 && factor == 1)
10706 : || (rgl->max_nscalars_per_iter * rgl->factor
10707 : == nscalars_per_iter * factor));
10708 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
10709 0 : rgl->type = vectype;
10710 0 : rgl->factor = factor;
10711 : }
10712 0 : }
10713 :
10714 : /* Given a complete set of lengths LENS, extract length number INDEX
10715 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10716 : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10717 : multipled by the number of elements that should be processed.
10718 : Insert any set-up statements before GSI. */
10719 :
10720 : tree
10721 0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10722 : vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10723 : unsigned int index, unsigned int factor, bool adjusted)
10724 : {
10725 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10726 0 : bool use_bias_adjusted_len =
10727 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10728 :
10729 : /* Populate the rgroup's len array, if this is the first time we've
10730 : used it. */
10731 0 : if (rgl->controls.is_empty ())
10732 : {
10733 0 : rgl->controls.safe_grow_cleared (nvectors, true);
10734 0 : for (unsigned int i = 0; i < nvectors; ++i)
10735 : {
10736 0 : tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10737 0 : gcc_assert (len_type != NULL_TREE);
10738 :
10739 0 : tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10740 :
10741 : /* Provide a dummy definition until the real one is available. */
10742 0 : SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10743 0 : rgl->controls[i] = len;
10744 :
10745 0 : if (use_bias_adjusted_len)
10746 : {
10747 0 : gcc_assert (i == 0);
10748 0 : tree adjusted_len =
10749 0 : make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10750 0 : SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10751 0 : rgl->bias_adjusted_ctrl = adjusted_len;
10752 : }
10753 : }
10754 : }
10755 :
10756 0 : if (use_bias_adjusted_len && adjusted)
10757 0 : return rgl->bias_adjusted_ctrl;
10758 :
10759 0 : tree loop_len = rgl->controls[index];
10760 0 : if (rgl->factor == 1 && factor == 1)
10761 : {
10762 0 : poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10763 0 : poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10764 0 : if (maybe_ne (nunits1, nunits2))
10765 : {
10766 : /* A loop len for data type X can be reused for data type Y
10767 : if X has N times more elements than Y and if Y's elements
10768 : are N times bigger than X's. */
10769 0 : gcc_assert (multiple_p (nunits1, nunits2));
10770 0 : factor = exact_div (nunits1, nunits2).to_constant ();
10771 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10772 0 : gimple_seq seq = NULL;
10773 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10774 0 : build_int_cst (iv_type, factor));
10775 0 : if (seq)
10776 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10777 : }
10778 0 : }
10779 0 : else if (factor && rgl->factor != factor)
10780 : {
10781 : /* The number of scalars per iteration, scalar occupied bytes and
10782 : the number of vectors are both compile-time constants. */
10783 0 : unsigned int nscalars_per_iter
10784 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10785 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10786 0 : unsigned int rglvecsize = rgl->factor * rgl->max_nscalars_per_iter;
10787 0 : unsigned int vecsize = nscalars_per_iter * factor;
10788 0 : if (rglvecsize > vecsize)
10789 : {
10790 0 : unsigned int fac = rglvecsize / vecsize;
10791 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10792 0 : gimple_seq seq = NULL;
10793 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10794 0 : build_int_cst (iv_type, fac));
10795 0 : if (seq)
10796 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10797 : }
10798 0 : else if (rglvecsize < vecsize)
10799 : {
10800 0 : unsigned int fac = vecsize / rglvecsize;
10801 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10802 0 : gimple_seq seq = NULL;
10803 0 : loop_len = gimple_build (&seq, MULT_EXPR, iv_type, loop_len,
10804 0 : build_int_cst (iv_type, fac));
10805 0 : if (seq)
10806 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10807 : }
10808 : }
10809 : return loop_len;
10810 : }
10811 :
10812 : /* Generate the tree for the loop len mask and return it. Given the lens,
10813 : nvectors, vectype, index and factor to gen the len mask as below.
10814 :
10815 : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
10816 : */
10817 : tree
10818 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10819 : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
10820 : unsigned int nvectors, tree vectype, tree stmt,
10821 : unsigned int index, unsigned int factor)
10822 : {
10823 0 : tree all_one_mask = build_all_ones_cst (vectype);
10824 0 : tree all_zero_mask = build_zero_cst (vectype);
10825 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
10826 : factor, true);
10827 0 : tree bias = build_int_cst (intQI_type_node,
10828 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
10829 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
10830 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
10831 : all_one_mask, all_zero_mask, len,
10832 : bias);
10833 0 : gimple_call_set_lhs (call, len_mask);
10834 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
10835 :
10836 0 : return len_mask;
10837 : }
10838 :
10839 : /* Scale profiling counters by estimation for LOOP which is vectorized
10840 : by factor VF.
10841 : If FLAT is true, the loop we started with had unrealistically flat
10842 : profile. */
10843 :
10844 : static void
10845 61566 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
10846 : {
10847 : /* For flat profiles do not scale down proportionally by VF and only
10848 : cap by known iteration count bounds. */
10849 61566 : if (flat)
10850 : {
10851 34623 : if (dump_file && (dump_flags & TDF_DETAILS))
10852 5286 : fprintf (dump_file,
10853 : "Vectorized loop profile seems flat; not scaling iteration "
10854 : "count down by the vectorization factor %i\n", vf);
10855 34623 : scale_loop_profile (loop, profile_probability::always (),
10856 : get_likely_max_loop_iterations_int (loop));
10857 34623 : return;
10858 : }
10859 : /* Loop body executes VF fewer times and exit increases VF times. */
10860 26943 : profile_count entry_count = loop_preheader_edge (loop)->count ();
10861 :
10862 : /* If we have unreliable loop profile avoid dropping entry
10863 : count below header count. This can happen since loops
10864 : has unrealistically low trip counts. */
10865 26943 : while (vf > 1
10866 28032 : && loop->header->count > entry_count
10867 57083 : && loop->header->count < entry_count * vf)
10868 : {
10869 2108 : if (dump_file && (dump_flags & TDF_DETAILS))
10870 155 : fprintf (dump_file,
10871 : "Vectorization factor %i seems too large for profile "
10872 : "prevoiusly believed to be consistent; reducing.\n", vf);
10873 2108 : vf /= 2;
10874 : }
10875 :
10876 26943 : if (entry_count.nonzero_p ())
10877 26943 : set_edge_probability_and_rescale_others
10878 26943 : (exit_e,
10879 26943 : entry_count.probability_in (loop->header->count / vf));
10880 : /* Avoid producing very large exit probability when we do not have
10881 : sensible profile. */
10882 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
10883 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10884 26943 : loop->latch->count = single_pred_edge (loop->latch)->count ();
10885 :
10886 26943 : scale_loop_profile (loop, profile_probability::always () / vf,
10887 : get_likely_max_loop_iterations_int (loop));
10888 : }
10889 :
10890 : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10891 : original loop that has now been vectorized.
10892 :
10893 : The inits of the data_references need to be advanced with the number of
10894 : iterations of the main loop. This has been computed in vect_do_peeling and
10895 : is stored in parameter ADVANCE.
10896 :
10897 : Since the loop_vec_info of this EPILOGUE was constructed for the original
10898 : loop, its stmt_vec_infos all point to the original statements. These need
10899 : to be updated to point to their corresponding copies.
10900 :
10901 : The data_reference's connections also need to be updated. Their
10902 : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10903 : stmt_vec_infos, their statements need to point to their corresponding
10904 : copy. */
10905 :
10906 : static void
10907 6823 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10908 : {
10909 6823 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10910 6823 : hash_map<tree,tree> mapping;
10911 6823 : gimple *orig_stmt, *new_stmt;
10912 6823 : gimple_stmt_iterator epilogue_gsi;
10913 6823 : gphi_iterator epilogue_phi_gsi;
10914 6823 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10915 6823 : basic_block *epilogue_bbs = get_loop_body (epilogue);
10916 6823 : unsigned i;
10917 :
10918 6823 : free (LOOP_VINFO_BBS (epilogue_vinfo));
10919 6823 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10920 6823 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
10921 :
10922 : /* The EPILOGUE loop is a copy of the original loop so they share the same
10923 : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10924 : point to the copied statements. */
10925 20469 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10926 : {
10927 13646 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10928 35154 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10929 : {
10930 21508 : new_stmt = epilogue_phi_gsi.phi ();
10931 :
10932 21508 : gcc_assert (gimple_uid (new_stmt) > 0);
10933 21508 : stmt_vinfo
10934 21508 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10935 :
10936 21508 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10937 : }
10938 :
10939 27292 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10940 136505 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10941 : {
10942 122859 : new_stmt = gsi_stmt (epilogue_gsi);
10943 122859 : if (is_gimple_debug (new_stmt))
10944 20235 : continue;
10945 :
10946 102624 : gcc_assert (gimple_uid (new_stmt) > 0);
10947 102624 : stmt_vinfo
10948 102624 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10949 :
10950 102624 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10951 :
10952 102624 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10953 102624 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10954 : {
10955 1938 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10956 : /* Set BB such that the assert in
10957 : 'get_initial_defs_for_reduction' is able to determine that
10958 : the BB of the related stmt is inside this loop. */
10959 1938 : gimple_set_bb (stmt,
10960 : gimple_bb (new_stmt));
10961 1938 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10962 1938 : gcc_assert (related_vinfo == NULL
10963 : || related_vinfo == stmt_vinfo);
10964 : }
10965 : }
10966 : }
10967 :
10968 6823 : struct data_reference *dr;
10969 6823 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10970 30861 : FOR_EACH_VEC_ELT (datarefs, i, dr)
10971 : {
10972 24038 : orig_stmt = DR_STMT (dr);
10973 24038 : gcc_assert (gimple_uid (orig_stmt) > 0);
10974 24038 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10975 24038 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10976 : }
10977 :
10978 : /* Advance data_reference's with the number of iterations of the previous
10979 : loop and its prologue. */
10980 6823 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10981 :
10982 : /* Remember the advancement made. */
10983 6823 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
10984 6823 : }
10985 :
10986 : /* When vectorizing early break statements instructions that happen before
10987 : the early break in the current BB need to be moved to after the early
10988 : break. This function deals with that and assumes that any validity
10989 : checks has already been performed.
10990 :
10991 : While moving the instructions if it encounters a VUSE or VDEF it then
10992 : corrects the VUSES as it moves the statements along. GDEST is the location
10993 : in which to insert the new statements. */
10994 :
10995 : static void
10996 1409 : move_early_exit_stmts (loop_vec_info loop_vinfo)
10997 : {
10998 1409 : DUMP_VECT_SCOPE ("move_early_exit_stmts");
10999 :
11000 1409 : if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11001 1190 : return;
11002 :
11003 : /* Move all stmts that need moving. */
11004 219 : basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11005 219 : gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
11006 :
11007 219 : tree last_seen_vuse = NULL_TREE;
11008 537 : for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11009 : {
11010 : /* We have to update crossed degenerate virtual PHIs. Simply
11011 : elide them. */
11012 318 : if (gphi *vphi = dyn_cast <gphi *> (stmt))
11013 : {
11014 7 : tree vdef = gimple_phi_result (vphi);
11015 7 : tree vuse = gimple_phi_arg_def (vphi, 0);
11016 7 : imm_use_iterator iter;
11017 7 : use_operand_p use_p;
11018 7 : gimple *use_stmt;
11019 30 : FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
11020 : {
11021 48 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
11022 16 : SET_USE (use_p, vuse);
11023 7 : }
11024 7 : auto gsi = gsi_for_stmt (stmt);
11025 7 : remove_phi_node (&gsi, true);
11026 7 : last_seen_vuse = vuse;
11027 7 : continue;
11028 7 : }
11029 :
11030 : /* Check to see if statement is still required for vect or has been
11031 : elided. */
11032 311 : auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11033 311 : if (!stmt_info)
11034 0 : continue;
11035 :
11036 311 : if (dump_enabled_p ())
11037 160 : dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11038 :
11039 311 : gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11040 311 : gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
11041 622 : last_seen_vuse = gimple_vuse (stmt);
11042 : }
11043 :
11044 : /* Update all the stmts with their new reaching VUSES. */
11045 689 : for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11046 : {
11047 198 : if (dump_enabled_p ())
11048 162 : dump_printf_loc (MSG_NOTE, vect_location,
11049 : "updating vuse to %T for load %G",
11050 : last_seen_vuse, p);
11051 198 : gimple_set_vuse (p, last_seen_vuse);
11052 198 : update_stmt (p);
11053 : }
11054 :
11055 : /* And update the LC PHIs on exits. */
11056 1108 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11057 451 : if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11058 245 : if (gphi *phi = get_virtual_phi (e->dest))
11059 464 : SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11060 : }
11061 :
11062 : /* Generate adjustment code for early break scalar IVs filling in the value
11063 : we created earlier on for LOOP_VINFO_EARLY_BRK_NITERS_VAR. */
11064 :
11065 : static void
11066 1409 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vec_info loop_vinfo)
11067 : {
11068 1409 : DUMP_VECT_SCOPE ("vect_update_ivs_after_vectorizer_for_early_breaks");
11069 :
11070 1409 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
11071 : /* If no peeling was done then we have no IV to update. */
11072 1409 : || !LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo))
11073 584 : return;
11074 :
11075 825 : tree phi_var = LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo);
11076 825 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11077 825 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11078 825 : tree ty_var = TREE_TYPE (phi_var);
11079 825 : auto loop = LOOP_VINFO_LOOP (loop_vinfo);
11080 825 : tree induc_var = niters_skip ? copy_ssa_name (phi_var) : phi_var;
11081 :
11082 825 : auto induction_phi = create_phi_node (induc_var, loop->header);
11083 825 : tree induc_def = PHI_RESULT (induction_phi);
11084 :
11085 : /* Create the iv update inside the loop. */
11086 825 : gimple_seq init_stmts = NULL;
11087 825 : gimple_seq stmts = NULL;
11088 825 : gimple_seq iv_stmts = NULL;
11089 825 : tree tree_vf = build_int_cst (ty_var, vf);
11090 :
11091 : /* For loop len targets we have to use .SELECT_VL (ivtmp_33, VF); instead of
11092 : just += VF as the VF can change in between two loop iterations. */
11093 825 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
11094 : {
11095 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
11096 0 : tree_vf = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
11097 : NULL_TREE, 0, 0, true);
11098 : }
11099 :
11100 825 : tree iter_var;
11101 825 : if (POINTER_TYPE_P (ty_var))
11102 : {
11103 0 : tree offset = gimple_convert (&stmts, sizetype, tree_vf);
11104 0 : iter_var = gimple_build (&stmts, POINTER_PLUS_EXPR, ty_var, induc_def,
11105 : gimple_convert (&stmts, sizetype, offset));
11106 : }
11107 : else
11108 : {
11109 825 : tree offset = gimple_convert (&stmts, ty_var, tree_vf);
11110 825 : iter_var = gimple_build (&stmts, PLUS_EXPR, ty_var, induc_def, offset);
11111 : }
11112 :
11113 825 : tree init_var = build_zero_cst (ty_var);
11114 825 : if (niters_skip)
11115 0 : init_var = gimple_build (&init_stmts, MINUS_EXPR, ty_var, init_var,
11116 : gimple_convert (&init_stmts, ty_var, niters_skip));
11117 :
11118 825 : add_phi_arg (induction_phi, iter_var,
11119 : loop_latch_edge (loop), UNKNOWN_LOCATION);
11120 825 : add_phi_arg (induction_phi, init_var,
11121 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
11122 :
11123 : /* Find the first insertion point in the BB. */
11124 825 : auto pe = loop_preheader_edge (loop);
11125 :
11126 : /* If we've done any peeling, calculate the peeling adjustment needed to the
11127 : final IV. */
11128 825 : if (niters_skip)
11129 : {
11130 0 : tree induc_type = TREE_TYPE (induc_def);
11131 0 : tree s_induc_type = signed_type_for (induc_type);
11132 0 : induc_def = gimple_build (&iv_stmts, MAX_EXPR, s_induc_type,
11133 : gimple_convert (&iv_stmts, s_induc_type,
11134 : induc_def),
11135 : build_zero_cst (s_induc_type));
11136 0 : auto stmt = gimple_build_assign (phi_var,
11137 : gimple_convert (&iv_stmts, induc_type,
11138 : induc_def));
11139 0 : gimple_seq_add_stmt_without_update (&iv_stmts, stmt);
11140 0 : basic_block exit_bb = NULL;
11141 : /* Identify the early exit merge block. I wish we had stored this. */
11142 0 : for (auto e : get_loop_exit_edges (loop))
11143 0 : if (e != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
11144 : {
11145 0 : exit_bb = e->dest;
11146 0 : break;
11147 0 : }
11148 :
11149 0 : gcc_assert (exit_bb);
11150 0 : auto exit_gsi = gsi_after_labels (exit_bb);
11151 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
11152 : }
11153 : /* Write the init_stmts in the loop-preheader block. */
11154 825 : auto psi = gsi_last_nondebug_bb (pe->src);
11155 825 : gsi_insert_seq_after (&psi, init_stmts, GSI_LAST_NEW_STMT);
11156 : /* Wite the adjustments in the header block. */
11157 825 : basic_block bb = loop->header;
11158 825 : auto si = gsi_after_labels (bb);
11159 825 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11160 : }
11161 :
/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - created vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.

   LOOP_VINFO describes the loop to transform.  LOOP_VECTORIZED_CALL is
   handed on unchanged to vect_loop_versioning when versioning is required.

   Returns scalar epilogue loop if any (the value produced by
   vect_do_peeling).  */

class loop *
vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  class loop *epilogue = NULL;
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  tree niters_vector = NULL_TREE;
  tree step_vector = NULL_TREE;
  tree niters_vector_mult_vf = NULL_TREE;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned int lowest_vf = constant_lower_bound (vf);
  gimple *stmt;
  bool check_profitability = false;
  unsigned int th;
  bool flat = maybe_flat_loop_profile (loop);
  bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);

  DUMP_VECT_SCOPE ("vec_transform_loop");

  /* The data reference check is only run for non-epilogue loops.  */
  if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    loop_vinfo->shared->check_datarefs ();

  /* Use the more conservative vectorization threshold.  If the number
     of iterations is constant assume the cost check has been performed
     by our caller.  If the threshold makes all loops profitable that
     run at least the (estimated) vectorization factor number of times
     checking is pointless, too.  */
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Profitability threshold is %d loop iterations.\n",
                         th);
      check_profitability = true;
    }

  /* Make sure there exists a single-predecessor exit bb.  Do this before
     versioning.  The split is skipped for early-break loops.  */
  edge e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
  if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    {
      split_loop_exit_edge (e, true);
      if (dump_enabled_p ())
        dump_printf (MSG_NOTE, "split exit edge\n");
    }

  /* Version the loop first, if required, so the profitability check
     comes first.  */

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      class loop *sloop
        = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
      sloop->force_vectorize = false;
      /* CHECK_PROFITABILITY is cleared so that vect_do_peeling is not asked
         for a profitability check as well.  */
      check_profitability = false;
    }

  /* Make sure there exists a single-predecessor exit bb also on the
     scalar loop copy.  Do this after versioning but before peeling
     so CFG structure is fine for both scalar and if-converted loop
     to make slpeel_duplicate_current_defs_from_edges face matched
     loop closed PHI nodes on the exit.  */
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
    {
      e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
      if (! single_pred_p (e->dest))
        {
          split_loop_exit_edge (e, true);
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
        }
    }

  /* Build the iteration count and peel.  vect_do_peeling fills in
     NITERS_VECTOR, STEP_VECTOR, NITERS_VECTOR_MULT_VF and ADVANCE through
     the pointers passed to it.  */
  tree niters = vect_build_loop_niters (loop_vinfo);
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
  tree advance;
  drs_init_vec orig_drs_init;
  bool niters_no_overflow = uncounted_p ? false /* Not known.  */
                                        : loop_niters_no_overflow (loop_vinfo);

  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
                              &step_vector, &niters_vector_mult_vf, th,
                              check_profitability, niters_no_overflow,
                              &advance);

  /* Assign hierarchical discriminators to the vectorized loop.  The
     multiplicity passed on is the constant lower bound of the VF, capped at
     DISCR_MULTIPLICITY_MAX.  */
  poly_uint64 vf_val = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned int vf_int = constant_lower_bound (vf_val);
  if (vf_int > DISCR_MULTIPLICITY_MAX)
    vf_int = DISCR_MULTIPLICITY_MAX;

  /* Assign unique copy_id dynamically instead of using hardcoded constants.
     Epilogue and main vectorized loops get different copy_ids.  The location
     used is that of the last non-debug statement of the loop header; nothing
     is assigned when that location is unknown.  */
  gimple *loop_last = last_nondebug_stmt (loop->header);
  location_t loop_loc
    = loop_last ? gimple_location (loop_last) : UNKNOWN_LOCATION;
  if (loop_loc != UNKNOWN_LOCATION)
    {
      unsigned int copyid = allocate_copyid_base (loop_loc, 1);
      assign_discriminators_to_loop (loop, vf_int, copyid);
    }
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
    {
      /* Ifcvt duplicates loop preheader, loop body and produces an basic
         block after loop exit.  We need to scale all that.  */
      basic_block preheader
        = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
      preheader->count
        = preheader->count.apply_probability
            (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
      scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
                              LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
      LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo)->dest->count = preheader->count;
    }

  /* If peeling did not provide the vector iteration count, compute it here
     (counted loops only).  */
  if (niters_vector == NULL_TREE && !uncounted_p)
    {
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
          && known_eq (lowest_vf, vf))
        {
          /* Constant iteration count and constant VF: divide directly.  */
          niters_vector
            = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                             LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
          step_vector = build_one_cst (TREE_TYPE (niters));
        }
      else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
        vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
                                     &step_vector, niters_no_overflow);
      else
        /* vect_do_peeling subtracted the number of peeled prologue
           iterations from LOOP_VINFO_NITERS.  */
        vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
                                     &niters_vector, &step_vector,
                                     niters_no_overflow);
    }

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);

  /* Handle any code motion that we need to for early-break vectorization after
     we've done peeling but just before we start vectorizing.  */
  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    {
      vect_update_ivs_after_vectorizer_for_early_breaks (loop_vinfo);
      move_early_exit_stmts (loop_vinfo);
    }

  /* Remove existing clobber stmts and prefetches.  The iterator is only
     advanced explicitly when nothing was removed.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
        {
          stmt = gsi_stmt (si);
          if (gimple_clobber_p (stmt)
              || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
            {
              unlink_stmt_vdef (stmt);
              gsi_remove (&si, true);
              release_defs (stmt);
            }
          else
            gsi_next (&si);
        }
    }

  /* Schedule the SLP instances.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
    }

  /* Generate the loop invariant statements.  They are inserted after the
     labels of the preheader block.  */
  if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "------>generating loop invariant statements\n");
      gimple_stmt_iterator gsi;
      gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
      gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
                             GSI_CONTINUE_LINKING);
    }

  /* Stub out scalar statements that must not survive vectorization and
     were not picked as relevant in any SLP instance.
     Doing this here helps with grouped statements, or statements that
     are involved in patterns.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
           !gsi_end_p (gsi); gsi_next (&gsi))
        {
          /* Only internal function calls are of interest here.  */
          gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
          if (!call || !gimple_call_internal_p (call))
            continue;
          internal_fn ifn = gimple_call_internal_fn (call);
          if (ifn == IFN_MASK_LOAD)
            {
              /* A MASK_LOAD with a non-vector result is replaced by an
                 assignment of zero.  */
              tree lhs = gimple_get_lhs (call);
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  tree zero = build_zero_cst (TREE_TYPE (lhs));
                  gimple *new_stmt = gimple_build_assign (lhs, zero);
                  gsi_replace (&gsi, new_stmt, true);
                }
            }
          else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
            {
              /* A conditional internal function with a non-vector result is
                 replaced by an assignment of its last argument.  */
              tree lhs = gimple_get_lhs (call);
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  tree else_arg
                    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
                  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
                  gsi_replace (&gsi, new_stmt, true);
                }
            }
          else if (ifn == IFN_MASK_CALL
                   && (stmt_info = loop_vinfo->lookup_stmt (call))
                   && !STMT_VINFO_RELEVANT_P (stmt_info)
                   && !STMT_VINFO_LIVE_P (stmt_info))
            {
              /* A MASK_CALL that is neither relevant nor live is removed;
                 it is asserted to have no LHS.  */
              gcc_assert (!gimple_call_lhs (stmt_info->stmt));
              loop_vinfo->remove_stmt (stmt_info);
            }
        }
    }

  if (!uncounted_p)
    {
      /* The vectorization factor is always > 1, so if we use an IV increment
         of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
      if (integer_onep (step_vector))
        niters_no_overflow = true;

      vect_set_loop_condition (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
                               loop_vinfo, niters_vector, step_vector,
                               niters_vector_mult_vf, !niters_no_overflow);
    }

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial
    = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);

  /* +1 to convert latch counts to loop iteration counts.  */
  int bias_for_lowest = 1;

  /* When we are peeling for gaps then we take away one scalar iteration
     from the vector loop.  Thus we can adjust the upper bound by one
     scalar iteration.  But only when we know the bound applies to the
     IV exit test which might not be true when we have multiple exits.  */
  if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;

  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
         iteration will have exactly alignment_npeels active elements.
         In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  The division rounds up when the final iteration
     may be partial and down otherwise.  */
  if (loop->any_upper_bound)
    {
      loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      loop->nb_iterations_upper_bound
        = (final_iter_may_be_partial
           ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
                            lowest_vf) - 1
           : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
                             lowest_vf) - 1);
      if (main_vinfo
          /* Both peeling for alignment and peeling for gaps can end up
             with the scalar epilogue running for more than VF-1 iterations.  */
          && !main_vinfo->peeling_for_alignment
          && !main_vinfo->peeling_for_gaps)
        {
          /* Further cap the bound using the largest of the main loop's VF,
             cost-model threshold and versioning threshold, divided (rounding
             away from zero) by this loop's VF.  */
          unsigned int bound;
          poly_uint64 main_iters
            = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
                           LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
          main_iters
            = upper_bound (main_iters,
                           LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
          if (can_div_away_from_zero_p (main_iters,
                                        LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                                        &bound))
            loop->nb_iterations_upper_bound
              = wi::umin ((bound_wide_int) (bound - 1),
                          loop->nb_iterations_upper_bound);
        }
    }
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
                          + bias_for_lowest, lowest_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
                           + bias_for_lowest, lowest_vf) - 1);
  /* The estimate is scaled by ASSUMED_VF rather than LOWEST_VF.  */
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
         ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
                          assumed_vf) - 1
         : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
                           assumed_vf) - 1);
  scale_profile_for_vect_loop (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
                               assumed_vf, flat);

  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "LOOP VECTORIZED\n");
          if (loop->inner)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "OUTER LOOP VECTORIZED\n");
          dump_printf (MSG_NOTE, "\n");
        }
      else
        dump_printf_loc (MSG_NOTE, vect_location,
                         "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
                         GET_MODE_NAME (loop_vinfo->vector_mode));
    }

  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
                         " variable-length vectorization factor\n");
    }

  /* When we have unrolled the loop due to a user requested value we should
     leave it up to the RTL unroll heuristics to determine if it's still worth
     while to unroll more.  */
  if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
    loop->unroll = 0;

  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear-up safelen field since its value is invalid after vectorization
     since vectorized loop can have loop-carried dependencies.  */
  loop->safelen = 0;

  if (epilogue)
    {
      /* Accumulate past advancements made.  */
      if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
        advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
                               LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
                               advance);
      update_epilogue_loop_vinfo (epilogue, advance);

      /* The epilogue inherits the SIMD uid and force_vectorize flag of this
         loop and is marked as eligible for vectorization.  */
      epilogue->simduid = loop->simduid;
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->dont_vectorize = false;
    }

  return epilogue;
}
11564 :
/* The code below is trying to perform simple optimization - revert
   if-conversion for masked stores, i.e. if the mask of a store is zero
   do not perform it and all stored value producers also if possible.
   For example,
     for (i=0; i<n; i++)
       if (c[i])
        {
          p1[i] += 1;
          p2[i] = p3[i] +2;
        }
   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }

   LOOP is the loop whose body (as returned by get_loop_body) is scanned for
   IFN_MASK_STORE calls.  Nothing is changed when none are found.  */

void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          stmt = gsi_stmt (gsi);
          if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
            worklist.safe_push (stmt);
        }
    }

  /* The block array is only needed for the scan above.  */
  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  Stores are popped from the back of the
     worklist, i.e. in reverse order of collection.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
         the same loop as if_bb.  It could be different to LOOP when two
         level loop-nest is vectorized and mask_store belongs to the inner
         one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      /* E (BB -> JOIN_BB) becomes the true edge of the "mask == 0" test
         built below; EFALSE (BB -> STORE_BB) is its false edge.  */
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::likely ();
      e->probability = efalse->probability.invert ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
        set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Create new block %d to sink mask stores.",
                         store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
         .MEM_2 = VDEF <.MEM_1>
         will be converted to
         .MEM_3 = VDEF <.MEM_1>
         and new PHI node will be created in join bb
         .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
        {
          gimple_stmt_iterator gsi_from;
          gimple *stmt1 = NULL;

          /* Move masked store to STORE_BB.  */
          last_store = last;
          gsi = gsi_for_stmt (last);
          gsi_from = gsi;
          /* Shift GSI to the previous stmt for further traversal.  */
          gsi_prev (&gsi);
          gsi_to = gsi_start_bb (store_bb);
          gsi_move_before (&gsi_from, &gsi_to);
          /* Setup GSI_TO to the non-empty block start.  */
          gsi_to = gsi_start_bb (store_bb);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Move stmt to created bb\n%G", last);
          /* Move all stored value producers if possible.  The walk goes
             backwards from the statement preceding the store and stops at
             the first statement that cannot be moved.  */
          while (!gsi_end_p (gsi))
            {
              tree lhs;
              imm_use_iterator imm_iter;
              use_operand_p use_p;
              bool res;

              /* Skip debug statements.  */
              if (is_gimple_debug (gsi_stmt (gsi)))
                {
                  gsi_prev (&gsi);
                  continue;
                }
              stmt1 = gsi_stmt (gsi);
              /* Do not consider statements writing to memory or having
                 volatile operand.  */
              if (gimple_vdef (stmt1)
                  || gimple_has_volatile_ops (stmt1))
                break;
              /* Remember STMT1's position in GSI_FROM and step GSI past it
                 before STMT1 is possibly removed or moved.  */
              gsi_from = gsi;
              gsi_prev (&gsi);
              lhs = gimple_get_lhs (stmt1);
              if (!lhs)
                break;

              /* LHS of vectorized stmt must be SSA_NAME.  */
              if (TREE_CODE (lhs) != SSA_NAME)
                break;

              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  /* Remove dead scalar statement.  */
                  if (has_zero_uses (lhs))
                    {
                      gsi_remove (&gsi_from, true);
                      release_defs (stmt1);
                      continue;
                    }
                }

              /* Check that LHS does not have uses outside of STORE_BB.
                 Debug uses are ignored.  */
              res = true;
              FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
                {
                  gimple *use_stmt;
                  use_stmt = USE_STMT (use_p);
                  if (is_gimple_debug (use_stmt))
                    continue;
                  if (gimple_bb (use_stmt) != store_bb)
                    {
                      res = false;
                      break;
                    }
                }
              if (!res)
                break;

              /* A statement reading memory is only moved when its VUSE is
                 the same as that of the store most recently moved.  */
              if (gimple_vuse (stmt1)
                  && gimple_vuse (stmt1) != gimple_vuse (last_store))
                break;

              /* Can move STMT1 to STORE_BB.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Move stmt to created bb\n%G", stmt1);
              gsi_move_before (&gsi_from, &gsi_to);
              /* Shift GSI_TO for further insertion.  */
              gsi_prev (&gsi_to);
            }
          /* Put other masked stores with the same mask to STORE_BB.  This
             continues only when the next store on the worklist uses the same
             mask and is exactly the statement the walk above stopped at.  */
          if (worklist.is_empty ()
              || gimple_call_arg (worklist.last (), 2) != mask
              || worklist.last () != stmt1)
            break;
          last = worklist.pop ();
        }
      /* The PHI argument on the edge that bypasses STORE_BB is the VUSE of
         the last store moved into STORE_BB.  */
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}
11776 :
11777 : /* Decide whether it is possible to use a zero-based induction variable
11778 : when vectorizing LOOP_VINFO with partial vectors. If it is, return
11779 : the value that the induction variable must be able to hold in order
11780 : to ensure that the rgroups eventually have no active vector elements.
11781 : Return -1 otherwise. */
11782 :
11783 : widest_int
11784 46716 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11785 : {
11786 46716 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11787 46716 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11788 46716 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11789 :
11790 : /* Calculate the value that the induction variable must be able
11791 : to hit in order to ensure that we end the loop with an all-false mask.
11792 : This involves adding the maximum number of inactive trailing scalar
11793 : iterations. */
11794 46716 : widest_int iv_limit = -1;
11795 46716 : if (max_loop_iterations (loop, &iv_limit))
11796 : {
11797 46716 : if (niters_skip)
11798 : {
11799 : /* Add the maximum number of skipped iterations to the
11800 : maximum iteration count. */
11801 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
11802 0 : iv_limit += wi::to_widest (niters_skip);
11803 : else
11804 0 : iv_limit += max_vf - 1;
11805 : }
11806 46716 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11807 : /* Make a conservatively-correct assumption. */
11808 320 : iv_limit += max_vf - 1;
11809 :
11810 : /* IV_LIMIT is the maximum number of latch iterations, which is also
11811 : the maximum in-range IV value. Round this value down to the previous
11812 : vector alignment boundary and then add an extra full iteration. */
11813 46716 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11814 46716 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11815 : }
11816 46716 : return iv_limit;
11817 : }
11818 :
11819 : /* For the given rgroup_controls RGC, check whether an induction variable
11820 : would ever hit a value that produces a set of all-false masks or zero
11821 : lengths before wrapping around. Return true if it's possible to wrap
11822 : around before hitting the desirable value, otherwise return false. */
11823 :
11824 : bool
11825 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11826 : {
11827 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11828 :
11829 0 : if (iv_limit == -1)
11830 : return true;
11831 :
11832 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11833 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
11834 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11835 :
11836 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11837 : return true;
11838 :
11839 : return false;
11840 0 : }
|