Line data Source code
1 : /* Loop Vectorization
2 : Copyright (C) 2003-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #define INCLUDE_ALGORITHM
23 : #include "config.h"
24 : #include "system.h"
25 : #include "coretypes.h"
26 : #include "backend.h"
27 : #include "target.h"
28 : #include "rtl.h"
29 : #include "tree.h"
30 : #include "gimple.h"
31 : #include "cfghooks.h"
32 : #include "tree-pass.h"
33 : #include "ssa.h"
34 : #include "optabs-tree.h"
35 : #include "memmodel.h"
36 : #include "optabs.h"
37 : #include "diagnostic-core.h"
38 : #include "fold-const.h"
39 : #include "stor-layout.h"
40 : #include "cfganal.h"
41 : #include "gimplify.h"
42 : #include "gimple-iterator.h"
43 : #include "gimplify-me.h"
44 : #include "tree-ssa-loop-ivopts.h"
45 : #include "tree-ssa-loop-manip.h"
46 : #include "tree-ssa-loop-niter.h"
47 : #include "tree-ssa-loop.h"
48 : #include "cfgloop.h"
49 : #include "tree-scalar-evolution.h"
50 : #include "tree-vectorizer.h"
51 : #include "gimple-fold.h"
52 : #include "cgraph.h"
53 : #include "tree-cfg.h"
54 : #include "tree-if-conv.h"
55 : #include "internal-fn.h"
56 : #include "tree-vector-builder.h"
57 : #include "vec-perm-indices.h"
58 : #include "tree-eh.h"
59 : #include "case-cfn-macros.h"
60 : #include "langhooks.h"
61 : #include "opts.h"
62 : #include "hierarchical_discriminator.h"
63 :
64 : /* Loop Vectorization Pass.
65 :
66 : This pass tries to vectorize loops.
67 :
68 : For example, the vectorizer transforms the following simple loop:
69 :
70 : short a[N]; short b[N]; short c[N]; int i;
71 :
72 : for (i=0; i<N; i++){
73 : a[i] = b[i] + c[i];
74 : }
75 :
76 : as if it was manually vectorized by rewriting the source code into:
77 :
78 : typedef int __attribute__((mode(V8HI))) v8hi;
79 : short a[N]; short b[N]; short c[N]; int i;
80 : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
81 : v8hi va, vb, vc;
82 :
83 : for (i=0; i<N/8; i++){
84 : vb = pb[i];
85 : vc = pc[i];
86 : va = vb + vc;
87 : pa[i] = va;
88 : }
89 :
90 : The main entry to this pass is vectorize_loops(), in which
91 : the vectorizer applies a set of analyses on a given set of loops,
92 : followed by the actual vectorization transformation for the loops that
93 : had successfully passed the analysis phase.
94 : Throughout this pass we make a distinction between two types of
95 : data: scalars (which are represented by SSA_NAMES), and memory references
96 : ("data-refs"). These two types of data require different handling both
97 : during analysis and transformation. The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
99 : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
100 : accesses are required to have a simple (consecutive) access pattern.
101 :
102 : Analysis phase:
103 : ===============
104 : The driver for the analysis phase is vect_analyze_loop().
105 : It applies a set of analyses, some of which rely on the scalar evolution
106 : analyzer (scev) developed by Sebastian Pop.
107 :
108 : During the analysis phase the vectorizer records some information
109 : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
110 : loop, as well as general information about the loop as a whole, which is
111 : recorded in a "loop_vec_info" struct attached to each loop.
112 :
113 : Transformation phase:
114 : =====================
115 : The loop transformation phase scans all the stmts in the loop, and
116 : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
117 : the loop that needs to be vectorized. It inserts the vector code sequence
118 : just before the scalar stmt S, and records a pointer to the vector code
119 : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
120 : attached to S). This pointer will be used for the vectorization of following
121 : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
122 : otherwise, we rely on dead code elimination for removing it.
123 :
124 : For example, say stmt S1 was vectorized into stmt VS1:
125 :
126 : VS1: vb = px[i];
127 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
128 : S2: a = b;
129 :
130 : To vectorize stmt S2, the vectorizer first finds the stmt that defines
131 : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
132 : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
133 : resulting sequence would be:
134 :
135 : VS1: vb = px[i];
136 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
137 : VS2: va = vb;
138 : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
139 :
140 : Operands that are not SSA_NAMEs, are data-refs that appear in
141 : load/store operations (like 'x[i]' in S1), and are handled differently.
142 :
143 : Target modeling:
144 : =================
145 : Currently the only target specific information that is used is the
146 : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
147 : Targets that can support different sizes of vectors, for now will need
148 : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
149 : flexibility will be added in the future.
150 :
   Since we only vectorize operations whose vector form can be
152 : expressed using existing tree codes, to verify that an operation is
153 : supported, the vectorizer checks the relevant optab at the relevant
154 : machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
155 : the value found is CODE_FOR_nothing, then there's no target support, and
156 : we can't vectorize the stmt.
157 :
158 : For additional information on this project see:
159 : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 : */
161 :
162 : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
163 : unsigned *);
164 : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
165 : gphi **);
166 :
167 :
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variables in the loop is
   considered a polynomial evolution.

   LOOP_NB is the number of the loop the evolution is analyzed in,
   ACCESS_FN the scalar-evolution access function of the PHI result, and
   STMT_INFO the stmt_vec_info of the loop-header PHI.  On success the
   initial value and step are recorded in STMT_INFO's
   LOOP_PHI_EVOLUTION_BASE_UNCHANGED / LOOP_PHI_EVOLUTION_PART fields.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
			     stmt_vec_info stmt_info)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
		     step_expr, init_expr);

  /* Record base and step even if we end up rejecting the step below;
     callers check these fields only when this function returned true.  */
  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;

  /* Accept the step if it is an INTEGER_CST, a loop-invariant SSA name of
     integral (or, with -fassociative-math, scalar float) type, or a
     REAL_CST under -fassociative-math.  Note the assignment to BB inside
     the condition: an SSA name whose defining stmt lies inside the loop
     (flow_bb_inside_loop_p) is not loop-invariant and is rejected.  */
  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
		  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
	  || !flag_associative_math))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step unknown.\n");
      return false;
    }

  return true;
}
220 :
221 : /* Function vect_is_nonlinear_iv_evolution
222 :
223 : Only support nonlinear induction for integer type
224 : 1. neg
225 : 2. mul by constant
226 : 3. lshift/rshift by constant.
227 :
228 : For neg induction, return a fake step as integer -1. */
229 : static bool
230 149121 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
231 : gphi* loop_phi_node)
232 : {
233 149121 : tree init_expr, ev_expr, result, op1, op2;
234 149121 : gimple* def;
235 :
236 149121 : if (gimple_phi_num_args (loop_phi_node) != 2)
237 : return false;
238 :
239 149121 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
240 149121 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
241 :
242 : /* Support nonlinear induction only for integer type. */
243 149121 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
244 : return false;
245 :
246 91444 : result = PHI_RESULT (loop_phi_node);
247 :
248 91444 : if (TREE_CODE (ev_expr) != SSA_NAME
249 89149 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
250 91444 : || !is_gimple_assign (def))
251 : return false;
252 :
253 83283 : enum tree_code t_code = gimple_assign_rhs_code (def);
254 83283 : tree step;
255 83283 : switch (t_code)
256 : {
257 1808 : case NEGATE_EXPR:
258 1808 : if (gimple_assign_rhs1 (def) != result)
259 : return false;
260 1808 : step = build_int_cst (TREE_TYPE (init_expr), -1);
261 1808 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
262 1808 : break;
263 :
264 15096 : case RSHIFT_EXPR:
265 15096 : case LSHIFT_EXPR:
266 15096 : case MULT_EXPR:
267 15096 : op1 = gimple_assign_rhs1 (def);
268 15096 : op2 = gimple_assign_rhs2 (def);
269 15096 : if (TREE_CODE (op2) != INTEGER_CST
270 11233 : || op1 != result)
271 : return false;
272 10851 : step = op2;
273 10851 : if (t_code == LSHIFT_EXPR)
274 472 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
275 10379 : else if (t_code == RSHIFT_EXPR)
276 9411 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
277 : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
278 : else
279 968 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
280 : break;
281 :
282 : default:
283 : return false;
284 : }
285 :
286 12659 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
287 12659 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
288 :
289 12659 : return true;
290 : }
291 :
/* Returns true if Phi is a first-order recurrence. A first-order
   recurrence is a non-reduction recurrence relation in which the value of
   the recurrence in the current loop iteration equals a value defined in
   the previous iteration.  LOOP_VINFO describes the loop being vectorized,
   LOOP the loop PHI belongs to.  */

static bool
vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
				   gphi *phi)
{
  /* A nested cycle isn't vectorizable as first order recurrence.  */
  if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
    return false;

  /* Ensure the loop latch definition is from within the loop.  */
  edge latch = loop_latch_edge (loop);
  tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
  if (TREE_CODE (ldef) != SSA_NAME
      || SSA_NAME_IS_DEFAULT_DEF (ldef)
      || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
      || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
    return false;

  tree def = gimple_phi_result (phi);

  /* Ensure every use_stmt of the phi node is dominated by the latch
     definition.  A use inside the latch definition itself also
     disqualifies (that would make this a cycle through the same stmt,
     not a plain previous-iteration read).  Debug uses are ignored.  */
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    if (!is_gimple_debug (USE_STMT (use_p))
	&& (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
	    || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
					    USE_STMT (use_p))))
      return false;

  /* First-order recurrence autovectorization needs shuffle vector.
     Require a vector type for the scalar type to exist at all.  */
  tree scalar_type = TREE_TYPE (def);
  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
  if (!vectype)
    return false;

  return true;
}
335 :
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  SLP indicates there will be some subsequent
   slp analyses or not.

   Works in two phases: first classify all induction PHIs (reduction
   detection below depends on inductions being known), then classify
   the remaining PHIs as reductions, double reductions, nested cycles
   or first-order recurrences.  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
{
  basic_block bb = loop->header;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
	continue;

      /* Skip already analyzed inner loop PHIs of double reductions.  */
      if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
	continue;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 (gimple *) phi);

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Access function of PHI: %T\n", access_fn);
      if (access_fn)
	STRIP_NOPS (access_fn);

      /* Not a (supported) induction: defer to phase two via the worklist.
	 For an outer loop (LOOP_VINFO_LOOP != loop) only constant-step
	 linear IVs are accepted; nonlinear IVs are only tried for the
	 loop being vectorized itself.  */
      if ((!access_fn
	   || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
	   || (LOOP_VINFO_LOOP (loop_vinfo) != loop
	       && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
		   != INTEGER_CST)))
	  /* Only handle nonlinear iv for same loop.  */
	  && (LOOP_VINFO_LOOP (loop_vinfo) != loop
	      || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
	{
	  worklist.safe_push (stmt_vinfo);
	  continue;
	}

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
		  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;

      /* Mark if we have a non-linear IV.
	 NOTE(review): this plain assignment overwrites the flag for every
	 induction PHI, so a later linear IV would clear a previously
	 recorded non-linear one — confirm whether |= is intended here.  */
      LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
	= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 (gimple *) phi);

      gcc_assert (!virtual_operand_p (def)
		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      /* DOUBLE_REDUC is set to the inner-loop PHI when this is the outer
	 PHI of a double reduction.  */
      gphi *double_reduc;
      stmt_vec_info reduc_stmt_info
	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
      if (reduc_stmt_info && double_reduc)
	{
	  stmt_vec_info inner_phi_info
	    = loop_vinfo->lookup_stmt (double_reduc);
	  /* ??? Pass down flag we're the inner loop of a double reduc.  */
	  stmt_vec_info inner_reduc_info
	    = vect_is_simple_reduction (loop_vinfo, inner_phi_info, NULL);
	  if (inner_reduc_info)
	    {
	      /* Cross-link outer PHI <-> outer reduction stmt and inner
		 PHI <-> inner reduction stmt.  */
	      STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	      STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	      STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
	      STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected double reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
	      /* Make it accessible for SLP vectorization.  */
	      LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
	    }
	  else if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Unknown def-use cycle pattern.\n");
	}
      else if (reduc_stmt_info)
	{
	  /* A reduction in an inner loop while vectorizing the outer loop
	     is only a nested cycle, not a reduction we may reassociate.  */
	  if (loop != LOOP_VINFO_LOOP (loop_vinfo))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected vectorizable nested cycle.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
	    }
	  else
	    {
	      STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	      STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
	      LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
	    }
	}
      else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
	STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
      else
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown def-use cycle pattern.\n");
    }
}
488 :
489 :
490 : /* Function vect_analyze_scalar_cycles.
491 :
492 : Examine the cross iteration def-use cycles of scalar variables, by
493 : analyzing the loop-header PHIs of scalar variables. Classify each
494 : cycle as one of the following: invariant, induction, reduction, unknown.
495 : We do that for the loop represented by LOOP_VINFO, and also to its
496 : inner-loop, if exists.
497 : Examples for scalar cycles:
498 :
499 : Example1: reduction:
500 :
501 : loop1:
502 : for (i=0; i<N; i++)
503 : sum += a[i];
504 :
505 : Example2: induction:
506 :
507 : loop2:
508 : for (i=0; i<N; i++)
509 : a[i] = i; */
510 :
511 : static void
512 377655 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
513 : {
514 377655 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
515 :
516 377655 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
517 :
518 : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
519 : Reductions in such inner-loop therefore have different properties than
520 : the reductions in the nest that gets vectorized:
521 : 1. When vectorized, they are executed in the same order as in the original
522 : scalar loop, so we can't change the order of computation when
523 : vectorizing them.
524 : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
525 : current checks are too strict. */
526 :
527 377655 : if (loop->inner)
528 5527 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
529 377655 : }
530 :
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   Only MAIN_EXIT contributes niter information; all other exits of
   LOOP merely contribute their controlling gcond to the returned
   vector.

   Return the loop exit conditions.  */


static vec<gcond *>
vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
		      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  vec<gcond *> conds;
  conds.create (exits.length ());
  class tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;

  /* Conservative defaults in case the main exit cannot be analyzed.  */
  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;

  DUMP_VECT_SCOPE ("get_loop_niters");

  if (exits.is_empty ())
    return conds;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
		     exits.length ());

  edge exit;
  unsigned int i;
  FOR_EACH_VEC_ELT (exits, i, exit)
    {
      gcond *cond = get_loop_exit_condition (exit);
      if (cond)
	conds.safe_push (cond);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);

      /* Niter analysis below applies to the main exit only.  */
      if (exit != main_exit)
	continue;

      may_be_zero = NULL_TREE;
      if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
	  || chrec_contains_undetermined (niter_desc.niter))
	continue;

      niter_assumptions = niter_desc.assumptions;
      may_be_zero = niter_desc.may_be_zero;
      niter = niter_desc.niter;

      /* A trivially-false may_be_zero needs no handling.  */
      if (may_be_zero && integer_zerop (may_be_zero))
	may_be_zero = NULL_TREE;

      if (may_be_zero)
	{
	  if (COMPARISON_CLASS_P (may_be_zero))
	    {
	      /* Try to combine may_be_zero with assumptions, this can simplify
		 computation of niter expression.  */
	      if (niter_assumptions && !integer_nonzerop (niter_assumptions))
		niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
						 niter_assumptions,
						 fold_build1 (TRUTH_NOT_EXPR,
							      boolean_type_node,
							      may_be_zero));
	      else
		niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
				     build_int_cst (TREE_TYPE (niter), 0),
				     rewrite_to_non_trapping_overflow (niter));

	      may_be_zero = NULL_TREE;
	    }
	  else if (integer_nonzerop (may_be_zero))
	    {
	      /* The loop body is provably never entered: zero latch
		 iterations, one header execution.  */
	      *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
	      *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
	      continue;
	    }
	  else
	    continue;
	}

      /* Loop assumptions are based off the normal exit.  */
      *assumptions = niter_assumptions;
      *number_of_iterationsm1 = niter;

      /* We want the number of loop header executions which is the number
	 of latch executions plus one.
	 ??? For UINT_MAX latch executions this number overflows to zero
	 for loops like do { n++; } while (n != 0);  */
      if (niter && !chrec_contains_undetermined (niter))
	{
	  niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
			       unshare_expr (niter),
			       build_int_cst (TREE_TYPE (niter), 1));
	  if (TREE_CODE (niter) == INTEGER_CST
	      && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
	    {
	      /* If we manage to fold niter + 1 into INTEGER_CST even when
		 niter is some complex expression, ensure back
		 *number_of_iterationsm1 is an INTEGER_CST as well.  See
		 PR113210.  */
	      *number_of_iterationsm1
		= fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
			       build_minus_one_cst (TREE_TYPE (niter)));
	    }
	}
      *number_of_iterations = niter;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");

  return conds;
}
652 :
/* Determine the main loop exit for the vectorizer.  Returns NULL when no
   suitable exit can be chosen (e.g. an exit with no controlling
   condition exists).  */

edge
vec_init_loop_exit_info (class loop *loop)
{
  /* Before we begin we must first determine which exit is the main one and
     which are auxiliary exits.  */
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  if (exits.length () == 0)
    return NULL;
  /* A single exit is trivially the main one.  */
  if (exits.length () == 1)
    return exits[0];

  /* If we have multiple exits, look for counting IV exit.
     Analyze all exits and return the last one we can analyze.  */
  class tree_niter_desc niter_desc;
  edge candidate = NULL;
  for (edge exit : exits)
    {
      if (!get_loop_exit_condition (exit))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Unhandled loop exit detected.\n");
	  return NULL;
	}

      if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
	  && !chrec_contains_undetermined (niter_desc.niter))
	{
	  tree may_be_zero = niter_desc.may_be_zero;
	  if ((integer_zerop (may_be_zero)
	       /* As we are handling may_be_zero that's not false by
		  rewriting niter to may_be_zero ? 0 : niter we require
		  an empty latch.  */
	       || (single_pred_p (loop->latch)
		   && exit->src == single_pred (loop->latch)
		   && (integer_nonzerop (may_be_zero)
		       || COMPARISON_CLASS_P (may_be_zero))))
	      /* Prefer the exit dominating any previously found candidate,
		 i.e. the latest analyzable exit in dominance order.  */
	      && (!candidate
		  || dominated_by_p (CDI_DOMINATORS, exit->src,
				     candidate->src)))
	    candidate = exit;
	}
    }

  /* If no exit is analyzable by scalar evolution, we return the last exit
     under the assumption we are dealing with an uncounted loop.  */
  if (!candidate && single_pred_p (loop->latch))
    candidate = loop_exits_from_bb_p (loop, single_pred (loop->latch));

  return candidate;
}
706 :
707 : /* Function bb_in_loop_p
708 :
709 : Used as predicate for dfs order traversal of the loop bbs. */
710 :
711 : static bool
712 1527666 : bb_in_loop_p (const_basic_block bb, const void *data)
713 : {
714 1527666 : const class loop *const loop = (const class loop *)data;
715 1527666 : if (flow_bb_inside_loop_p (loop, bb))
716 : return true;
717 : return false;
718 : }
719 :
720 :
/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.  All analysis state
   starts out cleared/unknown; SHARED holds data common to all attempted
   vector forms of the loop.  */

_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
  : vec_info (vec_info::loop, shared),
    loop (loop_in),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    vector_costs (nullptr),
    scalar_costs (nullptr),
    th (0),
    versioning_threshold (0),
    vectorization_factor (0),
    main_loop_edge (nullptr),
    skip_main_loop_edge (nullptr),
    skip_this_loop_edge (nullptr),
    reusable_accumulators (),
    suggested_unroll_factor (1),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    mask_skip_niters_pfa_offset (NULL_TREE),
    rgroup_compare_type (NULL_TREE),
    simd_if_cond (NULL_TREE),
    partial_vector_style (vect_partial_vectors_none),
    unaligned_dr (NULL),
    peeling_for_alignment (0),
    ptr_mask (0),
    max_spec_read_amount (0),
    nonlinear_iv (false),
    ivexpr_map (NULL),
    scan_map (NULL),
    inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
    vectorizable (false),
    can_use_partial_vectors_p (true),
    must_use_partial_vectors_p (false),
    using_partial_vectors_p (false),
    using_decrementing_iv_p (false),
    using_select_vl_p (false),
    allow_mutual_alignment (false),
    partial_load_store_bias (0),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    early_breaks (false),
    loop_iv_cond (NULL),
    user_unroll (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop_scaling (profile_probability::uninitialized ()),
    scalar_loop (NULL),
    main_loop_info (NULL),
    orig_loop_info (NULL),
    epilogue_vinfo (NULL),
    drs_advanced_by (NULL_TREE),
    vec_loop_main_exit (NULL),
    vec_epilogue_loop_main_exit (NULL),
    scalar_loop_main_exit (NULL)
{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would the same
     as reversed postorder traversal, so we are safe.  */

  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
			     loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  /* Create a stmt_vec_info (with uid reset) for every PHI and every
     non-debug stmt in the loop.  */
  for (unsigned int i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      gimple_stmt_iterator si;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *phi = gsi_stmt (si);
	  gimple_set_uid (phi, 0);
	  add_stmt (phi);
	}

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
	     third argument is the #pragma omp simd if (x) condition, when 0,
	     loop shouldn't be vectorized, when non-zero constant, it should
	     be vectorized normally, otherwise versioned with vectorized loop
	     done if the condition is non-zero at runtime.  */
	  if (loop_in->simduid
	      && is_gimple_call (stmt)
	      && gimple_call_internal_p (stmt)
	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
	      && gimple_call_num_args (stmt) >= 3
	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
	      && (loop_in->simduid
		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
	    {
	      tree arg = gimple_call_arg (stmt, 2);
	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
		simd_if_cond = arg;
	      else
		gcc_assert (integer_nonzerop (arg));
	    }
	}
    }
}
832 :
833 : /* Free all levels of rgroup CONTROLS. */
834 :
835 : void
836 1263273 : release_vec_loop_controls (vec<rgroup_controls> *controls)
837 : {
838 1263273 : rgroup_controls *rgc;
839 1263273 : unsigned int i;
840 1280847 : FOR_EACH_VEC_ELT (*controls, i, rgc)
841 17574 : rgc->controls.release ();
842 1263273 : controls->release ();
843 1263273 : }
844 :
845 : /* Free all memory used by the _loop_vec_info, as well as all the
846 : stmt_vec_info structs of all the stmts in the loop. */
847 :
848 500502 : _loop_vec_info::~_loop_vec_info ()
849 : {
850 500502 : free (bbs);
851 :
852 500502 : release_vec_loop_controls (&masks.rgc_vec);
853 500502 : release_vec_loop_controls (&lens);
854 504455 : delete ivexpr_map;
855 500824 : delete scan_map;
856 500502 : delete scalar_costs;
857 500502 : delete vector_costs;
858 647255 : for (auto reduc_info : reduc_infos)
859 142774 : delete reduc_info;
860 :
861 : /* When we release an epiloge vinfo that we do not intend to use
862 : avoid clearing AUX of the main loop which should continue to
863 : point to the main loop vinfo since otherwise we'll leak that. */
864 500502 : if (loop->aux == this)
865 61713 : loop->aux = NULL;
866 1001004 : }
867 :
868 : /* Return an invariant or register for EXPR and emit necessary
869 : computations in the LOOP_VINFO loop preheader. */
870 :
871 : tree
872 20595 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
873 : {
874 20595 : if (is_gimple_reg (expr)
875 20595 : || is_gimple_min_invariant (expr))
876 6910 : return expr;
877 :
878 13685 : if (! loop_vinfo->ivexpr_map)
879 3953 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
880 13685 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
881 13685 : if (! cached)
882 : {
883 8851 : gimple_seq stmts = NULL;
884 8851 : cached = force_gimple_operand (unshare_expr (expr),
885 : &stmts, true, NULL_TREE);
886 8851 : if (stmts)
887 : {
888 8711 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
889 8711 : gsi_insert_seq_on_edge_immediate (e, stmts);
890 : }
891 : }
892 13685 : return cached;
893 : }
894 :
895 : /* Return true if we can use CMP_TYPE as the comparison type to produce
896 : all masks required to mask LOOP_VINFO. */
897 :
898 : static bool
899 78817 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
900 : {
901 78817 : rgroup_controls *rgm;
902 78817 : unsigned int i;
903 91375 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
904 91375 : if (rgm->type != NULL_TREE
905 91375 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
906 : cmp_type, rgm->type,
907 : OPTIMIZE_FOR_SPEED))
908 : return false;
909 : return true;
910 : }
911 :
912 : /* Calculate the maximum number of scalars per iteration for every
913 : rgroup in LOOP_VINFO. */
914 :
915 : static unsigned int
916 16755 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
917 : {
918 16755 : unsigned int res = 1;
919 16755 : unsigned int i;
920 16755 : rgroup_controls *rgm;
921 41044 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
922 24289 : res = MAX (res, rgm->max_nscalars_per_iter);
923 16755 : return res;
924 : }
925 :
926 : /* Calculate the minimum precision necessary to represent:
927 :
928 : MAX_NITERS * FACTOR
929 :
930 : as an unsigned integer, where MAX_NITERS is the maximum number of
931 : loop header iterations for the original scalar form of LOOP_VINFO. */
932 :
unsigned
vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Get the maximum number of iterations that is representable
     in the counter type.  */
  tree ni_type;
  if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
    ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
  else
    /* For uncounted loops there is no niter expression whose type we
       could use; fall back to sizetype as the counter type.  */
    ni_type = sizetype;
  /* +1 because NITERSM1 counts latch executions, one less than the
     number of header iterations.  */
  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;

  /* Get a more refined estimate for the number of iterations.  */
  widest_int max_back_edges;
  if (max_loop_iterations (loop, &max_back_edges))
    max_ni = wi::smin (max_ni, max_back_edges + 1);

  /* Work out how many bits we need to represent the limit.  */
  return wi::min_precision (max_ni * factor, UNSIGNED);
}
955 :
956 : /* True if the loop needs peeling or partial vectors when vectorized. */
957 :
static bool
vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
{
  unsigned HOST_WIDE_INT const_vf;

  /* Peeling for gaps always forces an epilogue.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    return true;

  /* Alignment-peeling decisions live on the main loop's vinfo, so when
     analyzing an epilogue loop consult the main loop instead.  */
  loop_vec_info main_loop_vinfo
    = (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
       ? LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) : loop_vinfo);
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo) >= 0)
    {
      /* Work out the (constant) number of iterations that need to be
	 peeled for reasons other than niters.  */
      unsigned int peel_niter
	= LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
      /* No peeling or partial vectors are needed iff the remaining
	 constant iteration count divides the VF exactly.  */
      return !multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
			  LOOP_VINFO_VECT_FACTOR (loop_vinfo));
    }

  /* With symbolic niters, divisibility by a constant VF can still be
     proved from the known trailing zero bits of the niters expression.  */
  if (!LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo)
      && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf))
    {
      /* When the number of iterations is a multiple of the vectorization
	 factor and we are not doing prologue or forced epilogue peeling
	 the epilogue isn't necessary.  */
      if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
	  >= (unsigned) exact_log2 (const_vf))
	return false;
    }

  return true;
}
993 :
994 : /* Each statement in LOOP_VINFO can be masked where necessary. Check
995 : whether we can actually generate the masks required. Return true if so,
996 : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
997 :
static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  unsigned int min_ni_width;

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Produce the rgroup controls.  Each mask_set entry pairs a vector
     type with the number of vectors needed for it.  For the WHILE_ULT
     style, rgc_vec is indexed by nvectors - 1.  */
  for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    {
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      tree vectype = mask.first;
      unsigned nvectors = mask.second;

      if (masks->rgc_vec.length () < nvectors)
	masks->rgc_vec.safe_grow_cleared (nvectors, true);
      rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* Keep the widest requirement seen for this rgroup slot.  */
      if (rgm->max_nscalars_per_iter < nscalars_per_iter)
	{
	  rgm->max_nscalars_per_iter = nscalars_per_iter;
	  rgm->type = truth_type_for (vectype);
	  rgm->factor = 1;
	}
    }

  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width
    = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  /* iv_limit == -1 means the limit is unknown; leave iv_precision at
     UINT_MAX so no comparison type can satisfy it early.  */
  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
				      UNSIGNED);

  /* Iterate over candidate scalar integer modes, narrowest first.  */
  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (this_type
	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
	    {
	      /* Although we could stop as soon as we find a valid mode,
		 there are at least two reasons why that's not always the
		 best choice:

		 - An IV that's Pmode or wider is more likely to be reusable
		   in address calculations than an IV that's narrower than
		   Pmode.

		 - Doing the comparison in IV_PRECISION or wider allows
		   a natural 0-based IV, whereas using a narrower comparison
		   type requires mitigations against wrap-around.

		 Conversely, if the IV limit is variable, doing the comparison
		 in a wider type than the original type can introduce
		 unnecessary extensions, so picking the widest valid mode
		 is not always a good choice either.

		 Here we prefer the first IV type that's Pmode or wider,
		 and the first comparison type that's IV_PRECISION or wider.
		 (The comparison type must be no wider than the IV type,
		 to avoid extensions in the vector loop.)

		 ??? We might want to try continuing beyond Pmode for ILP32
		 targets if CMP_BITS < IV_PRECISION.  */
	      iv_type = this_type;
	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
		cmp_type = this_type;
	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
		break;
	    }
	}
    }

  /* No workable comparison type: give up on WHILE_ULT masking and
     discard the rgroup controls built above.  */
  if (!cmp_type)
    {
      LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
  return true;
}
1105 :
1106 : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1107 : whether we can actually generate AVX512 style masks. Return true if so,
1108 : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1109 :
static bool
vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
{
  /* Produce differently organized rgc_vec and differently check
     we can produce masks.  */

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* For the decrementing IV we need to represent all values in
     [0, niter + niter_skip] where niter_skip is the elements we
     skip in the first iteration for prologue peeling.  */
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;
  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit, UNSIGNED);

  /* First compute the type for the IV we use to track the remaining
     scalar iterations.  Modes are visited narrowest first, so this
     picks the narrowest supported mode wide enough for IV_PRECISION.  */
  opt_scalar_int_mode cmp_mode_iter;
  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= iv_precision
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  iv_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (iv_type)
	    break;
	}
    }
  if (!iv_type)
    return false;

  /* Produce the rgroup controls.  */
  for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    {
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      tree vectype = mask.first;
      unsigned nvectors = mask.second;

      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* We index the rgroup_controls vector with nscalars_per_iter
	 which we keep constant and instead have a varying nvectors,
	 remembering the vector mask with the fewest nV.  Note this is
	 the opposite indexing scheme from vect_verify_full_masking.  */
      if (masks->rgc_vec.length () < nscalars_per_iter)
	masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
      rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];

      if (!rgm->type || rgm->factor > nvectors)
	{
	  rgm->type = truth_type_for (vectype);
	  rgm->compare_type = NULL_TREE;
	  rgm->max_nscalars_per_iter = nscalars_per_iter;
	  rgm->factor = nvectors;
	  rgm->bias_adjusted_ctrl = NULL_TREE;
	}
    }

  /* There is no fixed compare type we are going to use but we have to
     be able to get at one for each mask group.  */
  unsigned int min_ni_width
    = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);

  bool ok = true;
  for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
    {
      tree mask_type = rgc.type;
      /* Slots grown but never filled above have no type; skip them.  */
      if (!mask_type)
	continue;

      /* For now vect_get_loop_mask only supports integer mode masks
	 when we need to split it.  */
      if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
	  || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
	{
	  ok = false;
	  break;
	}

      /* If iv_type is usable as compare type use that - we can elide the
	 saturation in that case.  */
      if (TYPE_PRECISION (iv_type) >= min_ni_width)
	{
	  tree cmp_vectype
	    = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
	  if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
	    rgc.compare_type = cmp_vectype;
	}
      /* Otherwise search for the narrowest scalar type whose vector
	 comparison against MASK_TYPE the target can expand.  */
      if (!rgc.compare_type)
	FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
	  {
	    unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
	    if (cmp_bits >= min_ni_width
		&& targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	      {
		tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
		if (!cmp_type)
		  continue;

		/* Check whether we can produce the mask with cmp_type.  */
		tree cmp_vectype
		  = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
		if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
		  {
		    rgc.compare_type = cmp_vectype;
		    break;
		  }
	      }
	  }
      if (!rgc.compare_type)
	{
	  ok = false;
	  break;
	}
    }
  /* On failure release the rgroup controls built above.  */
  if (!ok)
    {
      release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
      return false;
    }

  /* There is no single compare type for this style; flag that with
     error_mark_node (per-rgroup types are in rgc.compare_type).  */
  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
  return true;
}
1246 :
/* Check whether we can use vector access with length based on precision
   comparison.  So far, to keep it simple, we only allow the case that the
   precision of the target supported length is larger than the precision
   required by loop niters.  */
1251 :
static bool
vect_verify_loop_lens (loop_vec_info loop_vinfo)
{
  /* Nothing to do if no statement needs length-based control.  */
  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    return false;

  if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
    return false;

  /* The target must provide both length-controlled loads and stores
     for the chosen vector mode.  */
  machine_mode len_load_mode, len_store_mode;
  if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
	 .exists (&len_load_mode))
    return false;
  if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
	 .exists (&len_store_mode))
    return false;

  signed char partial_load_bias = internal_len_load_store_bias
    (IFN_LEN_LOAD, len_load_mode);

  signed char partial_store_bias = internal_len_load_store_bias
    (IFN_LEN_STORE, len_store_mode);

  /* Mixed biases for loads and stores are not supported.  */
  gcc_assert (partial_load_bias == partial_store_bias);

  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
    return false;

  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
     len_loads with a length of zero.  In order to avoid that we prohibit
     more than one loop length here.  */
  if (partial_load_bias == -1
      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
    return false;

  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;

  unsigned int max_nitems_per_iter = 1;
  unsigned int i;
  rgroup_controls *rgl;
  /* Find the maximum number of items per iteration for every rgroup.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
    {
      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
    }

  /* Work out how many bits we need to represent the length limit.  */
  unsigned int min_ni_prec
    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);

  /* Now use the maximum of below precisions for one suitable IV type:
     - the IV's natural precision
     - the precision needed to hold: the maximum number of scalar
       iterations multiplied by the scale factor (min_ni_prec above)
     - the Pmode precision

     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     wider IV to avoid narrow conversions.  */

  unsigned int ni_prec
    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
  min_ni_prec = MAX (min_ni_prec, ni_prec);
  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));

  /* Pick the narrowest supported integer mode of at least MIN_NI_PREC
     bits, capped at the word size.  */
  tree iv_type = NULL_TREE;
  opt_scalar_int_mode tmode_iter;
  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
    {
      scalar_mode tmode = tmode_iter.require ();
      unsigned int tbits = GET_MODE_BITSIZE (tmode);

      /* ??? Do we really want to construct one IV whose precision exceeds
	 BITS_PER_WORD?  */
      if (tbits > BITS_PER_WORD)
	break;

      /* Find the first available standard integral type.  */
      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
	{
	  iv_type = build_nonstandard_integer_type (tbits, true);
	  break;
	}
    }

  if (!iv_type)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't vectorize with length-based partial vectors"
			 " because there is no suitable iv type.\n");
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;

  return true;
}
1353 :
1354 : /* Calculate the cost of one scalar iteration of the loop. */
/* Calculate the cost of one scalar iteration of the loop.  */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor;
  int innerloop_iters, i;

  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");

  /* Gather costs for statements in the scalar loop.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      /* Statements in the inner loop execute INNERLOOP_ITERS times
	 per outer iteration; scale their cost accordingly.  */
      if (bb->loop_father == loop->inner)
	factor = innerloop_iters;
      else
	factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);

	  /* Only assignments, calls and conditions carry a scalar cost;
	     this also filters out debug and label statements.  */
	  if (!is_gimple_assign (stmt)
	      && !is_gimple_call (stmt)
	      && !is_a<gcond *> (stmt))
	    continue;

	  /* Skip stmts that are not vectorized inside the loop.  */
	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
	  if (!STMT_VINFO_RELEVANT_P (vstmt_info)
	      && (!STMT_VINFO_LIVE_P (vstmt_info)
		  || !VECTORIZABLE_CYCLE_DEF
			(STMT_VINFO_DEF_TYPE (vstmt_info))))
	    continue;

	  /* Classify the statement: load, store, free conversion
	     (skipped) or generic scalar statement.  */
	  vect_cost_for_stmt kind;
	  if (STMT_VINFO_DATA_REF (stmt_info))
	    {
	      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
		kind = scalar_load;
	      else
		kind = scalar_store;
	    }
	  else if (vect_nop_conversion_p (stmt_info))
	    continue;
	  else
	    kind = scalar_stmt;

	  /* We are using vect_prologue here to avoid scaling twice
	     by the inner loop factor.  */
	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
			    factor, kind, stmt_info, 0, vect_prologue);
	}
    }

  /* Now accumulate cost.  */
  loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
  add_stmt_costs (loop_vinfo->scalar_costs,
		  &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
  loop_vinfo->scalar_costs->finish_cost (nullptr);
}
1426 :
1427 : /* Function vect_analyze_loop_form.
1428 :
1429 : Verify that certain CFG restrictions hold, including:
1430 : - the loop has a pre-header
1431 : - the loop has a single entry
1432 : - nested loops can have only a single exit.
1433 : - the loop exit condition is simple enough
1434 : - the number of iterations can be analyzed, i.e, a countable loop. The
1435 : niter could be analyzed under some assumptions. */
1436 :
1437 : opt_result
1438 457121 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1439 : vect_loop_form_info *info)
1440 : {
1441 457121 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1442 :
1443 457121 : edge exit_e = vec_init_loop_exit_info (loop);
1444 457121 : if (!exit_e)
1445 29147 : return opt_result::failure_at (vect_location,
1446 : "not vectorized:"
1447 : " Infinite loop detected.\n");
1448 427974 : if (loop_vectorized_call)
1449 : {
1450 28807 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1451 28807 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1452 28807 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1453 28807 : if (!scalar_exit_e)
1454 0 : return opt_result::failure_at (vect_location,
1455 : "not vectorized:"
1456 : " could not determine main exit from"
1457 : " loop with multiple exits.\n");
1458 : }
1459 :
1460 427974 : info->loop_exit = exit_e;
1461 427974 : if (dump_enabled_p ())
1462 15984 : dump_printf_loc (MSG_NOTE, vect_location,
1463 : "using as main loop exit: %d -> %d [AUX: %p]\n",
1464 15984 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1465 :
1466 : /* Check if we have any control flow that doesn't leave the loop. */
1467 427974 : basic_block *bbs = get_loop_body (loop);
1468 1402193 : for (unsigned i = 0; i < loop->num_nodes; i++)
1469 1089214 : if (EDGE_COUNT (bbs[i]->succs) != 1
1470 1089214 : && (EDGE_COUNT (bbs[i]->succs) != 2
1471 652616 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1472 : {
1473 114995 : free (bbs);
1474 114995 : return opt_result::failure_at (vect_location,
1475 : "not vectorized:"
1476 : " unsupported control flow in loop.\n");
1477 : }
1478 :
1479 : /* Check if we have any control flow that doesn't leave the loop. */
1480 314074 : bool has_phi = false;
1481 314074 : for (unsigned i = 0; i < loop->num_nodes; i++)
1482 313617 : if (!gimple_seq_empty_p (phi_nodes (bbs[i])))
1483 : {
1484 : has_phi = true;
1485 : break;
1486 : }
1487 312979 : if (!has_phi)
1488 457 : return opt_result::failure_at (vect_location,
1489 : "not vectorized:"
1490 : " no scalar evolution detected in loop.\n");
1491 :
1492 312522 : free (bbs);
1493 :
1494 : /* Different restrictions apply when we are considering an inner-most loop,
1495 : vs. an outer (nested) loop.
1496 : (FORNOW. May want to relax some of these restrictions in the future). */
1497 :
1498 312522 : info->inner_loop_cond = NULL;
1499 312522 : if (!loop->inner)
1500 : {
1501 : /* Inner-most loop. */
1502 :
1503 294055 : if (empty_block_p (loop->header))
1504 0 : return opt_result::failure_at (vect_location,
1505 : "not vectorized: empty loop.\n");
1506 : }
1507 : else
1508 : {
1509 18467 : class loop *innerloop = loop->inner;
1510 18467 : edge entryedge;
1511 :
1512 : /* Nested loop. We currently require that the loop is doubly-nested,
1513 : contains a single inner loop with a single exit to the block
1514 : with the single exit condition in the outer loop.
1515 : Vectorizable outer-loops look like this:
1516 :
1517 : (pre-header)
1518 : |
1519 : header <---+
1520 : | |
1521 : inner-loop |
1522 : | |
1523 : tail ------+
1524 : |
1525 : (exit-bb)
1526 :
1527 : The inner-loop also has the properties expected of inner-most loops
1528 : as described above. */
1529 :
1530 18467 : if ((loop->inner)->inner || (loop->inner)->next)
1531 2935 : return opt_result::failure_at (vect_location,
1532 : "not vectorized:"
1533 : " multiple nested loops.\n");
1534 :
1535 15532 : entryedge = loop_preheader_edge (innerloop);
1536 15532 : if (entryedge->src != loop->header
1537 15036 : || !single_exit (innerloop)
1538 26938 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1539 4468 : return opt_result::failure_at (vect_location,
1540 : "not vectorized:"
1541 : " unsupported outerloop form.\n");
1542 :
1543 : /* Analyze the inner-loop. */
1544 11064 : vect_loop_form_info inner;
1545 11064 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1546 11064 : if (!res)
1547 : {
1548 416 : if (dump_enabled_p ())
1549 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1550 : "not vectorized: Bad inner loop.\n");
1551 416 : return res;
1552 : }
1553 :
1554 : /* Don't support analyzing niter under assumptions for inner
1555 : loop. */
1556 10648 : if (!integer_onep (inner.assumptions))
1557 257 : return opt_result::failure_at (vect_location,
1558 : "not vectorized: Bad inner loop.\n");
1559 :
1560 10391 : if (inner.number_of_iterations == chrec_dont_know
1561 10391 : || !expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1562 1837 : return opt_result::failure_at (vect_location,
1563 : "not vectorized: inner-loop count not"
1564 : " invariant.\n");
1565 :
1566 8554 : if (dump_enabled_p ())
1567 1046 : dump_printf_loc (MSG_NOTE, vect_location,
1568 : "Considering outer-loop vectorization.\n");
1569 8554 : info->inner_loop_cond = inner.conds[0];
1570 11064 : }
1571 :
1572 302609 : if (EDGE_COUNT (loop->header->preds) != 2)
1573 0 : return opt_result::failure_at (vect_location,
1574 : "not vectorized:"
1575 : " too many incoming edges.\n");
1576 :
1577 : /* We assume that the latch is empty. */
1578 302609 : basic_block latch = loop->latch;
1579 302609 : do
1580 : {
1581 302609 : if (!empty_block_p (latch)
1582 302609 : || !gimple_seq_empty_p (phi_nodes (latch)))
1583 20605 : return opt_result::failure_at (vect_location,
1584 : "not vectorized: latch block not "
1585 : "empty.\n");
1586 282004 : latch = single_pred (latch);
1587 : }
1588 564008 : while (single_succ_p (latch));
1589 :
1590 : /* Make sure there is no abnormal exit. */
1591 282004 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1592 1250889 : for (edge e : exits)
1593 : {
1594 404910 : if (e->flags & EDGE_ABNORMAL)
1595 33 : return opt_result::failure_at (vect_location,
1596 : "not vectorized:"
1597 : " abnormal loop exit edge.\n");
1598 : }
1599 :
1600 281971 : info->conds
1601 281971 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1602 : &info->number_of_iterations,
1603 281971 : &info->number_of_iterationsm1);
1604 281971 : if (info->conds.is_empty ())
1605 30 : return opt_result::failure_at
1606 30 : (vect_location,
1607 : "not vectorized: complicated exit condition.\n");
1608 :
1609 : /* Determine what the primary and alternate exit conds are. */
1610 686788 : for (unsigned i = 0; i < info->conds.length (); i++)
1611 : {
1612 404847 : gcond *cond = info->conds[i];
1613 404847 : if (exit_e->src == gimple_bb (cond))
1614 281941 : std::swap (info->conds[0], info->conds[i]);
1615 : }
1616 :
1617 281941 : if (chrec_contains_undetermined (info->number_of_iterations))
1618 : {
1619 60264 : if (dump_enabled_p ())
1620 257 : dump_printf_loc (MSG_NOTE, vect_location,
1621 : "Loop being analyzed as uncounted.\n");
1622 60264 : if (loop->inner)
1623 562 : return opt_result::failure_at
1624 562 : (vect_location,
1625 : "not vectorized: outer loop vectorization of uncounted loops"
1626 : " is unsupported.\n");
1627 59702 : return opt_result::success ();
1628 : }
1629 :
1630 221677 : if (integer_zerop (info->assumptions))
1631 4 : return opt_result::failure_at
1632 4 : (info->conds[0],
1633 : "not vectorized: number of iterations cannot be computed.\n");
1634 :
1635 221673 : if (integer_zerop (info->number_of_iterations))
1636 12 : return opt_result::failure_at
1637 12 : (info->conds[0],
1638 : "not vectorized: number of iterations = 0.\n");
1639 :
1640 221661 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1641 120623 : && tree_to_shwi (info->number_of_iterations) > 0))
1642 : {
1643 101038 : if (dump_enabled_p ())
1644 : {
1645 2469 : dump_printf_loc (MSG_NOTE, vect_location,
1646 : "Symbolic number of iterations is ");
1647 2469 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1648 2469 : dump_printf (MSG_NOTE, "\n");
1649 : }
1650 : }
1651 :
1652 221661 : if (!integer_onep (info->assumptions))
1653 : {
1654 8533 : if (dump_enabled_p ())
1655 : {
1656 66 : dump_printf_loc (MSG_NOTE, vect_location,
1657 : "Loop to be versioned with niter assumption ");
1658 66 : dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
1659 66 : dump_printf (MSG_NOTE, "\n");
1660 : }
1661 : }
1662 :
1663 221661 : return opt_result::success ();
1664 282004 : }
1665 :
1666 : /* Create a loop_vec_info for LOOP with SHARED and the
1667 : vect_analyze_loop_form result. */
1668 :
1669 : loop_vec_info
1670 500502 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1671 : const vect_loop_form_info *info,
1672 : loop_vec_info orig_loop_info)
1673 : {
1674 500502 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1675 500502 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1676 500502 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1677 500502 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1678 500502 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
     : /* If ORIG_LOOP_INFO is itself an epilogue, chain through it so that
     : MAIN_LOOP_INFO always refers to the first vectorized (main) loop;
     : otherwise ORIG_LOOP_INFO (possibly NULL) is that main loop. */
1679 500502 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1680 171 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1681 171 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1682 : else
1683 500331 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1684 : /* Also record the assumptions for versioning.  Only done for the
1685 : main loop; epilogues inherit versioning from it. */
1685 500502 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1686 18847 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1687 :
1688 2252309 : for (gcond *cond : info->conds)
1689 : {
1690 750803 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1691 : /* Mark the statement as a condition. */
1692 750803 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1693 : }
1694 :
     : /* For counted loops the first recorded condition is the loop IV
     : exit condition; any remaining conditions are early-break exits. */
1695 500502 : unsigned cond_id = 0;
1696 500502 : if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
1697 412813 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[cond_id++];
1698 :
1699 838492 : for (; cond_id < info->conds.length (); cond_id ++)
1700 337990 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[cond_id]);
1701 :
1702 500502 : LOOP_VINFO_MAIN_EXIT (loop_vinfo) = info->loop_exit;
1703 :
1704 : /* Check to see if we're vectorizing multiple exits. */
1705 500502 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1706 500502 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1707 :
1708 500502 : if (info->inner_loop_cond)
1709 : {
1710 : /* If we have an estimate on the number of iterations of the inner
1711 : loop use that to limit the scale for costing, otherwise use
1712 : --param vect-inner-loop-cost-factor literally. */
1713 8675 : widest_int nit;
1714 8675 : if (estimated_stmt_executions (loop->inner, &nit))
1715 7394 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1716 7394 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1717 8675 : }
1718 :
1719 500502 : return loop_vinfo;
1720 : }
1721 :
1722 :
1723 :
1724 : /* Return true if we know that the iteration count is smaller than the
1725 : vectorization factor. Return false if it isn't, or if we can't be sure
1726 : either way. */
1727 :
1728 : static bool
1729 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
     : {
     :   /* Upper bound on the scalar iteration count: the exact count when
     :      it is known at compile time, otherwise the recorded maximum
     :      number of statement executions (-1 when even that is unknown).  */
     :   HOST_WIDE_INT niter_bound
     :     = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
     :        ? LOOP_VINFO_INT_NITERS (loop_vinfo)
     :        : max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)));
     :   if (niter_bound == -1)
     :     return false;
     :
     :   /* Compare against the vectorization factor used for costing.  */
     :   return ((unsigned HOST_WIDE_INT) niter_bound
     :           < vect_vf_for_cost (loop_vinfo));
     : }
1744 :
1745 : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1746 : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1747 : definitely no, or -1 if it's worth retrying. */
1748 :
1749 : static int
1750 112192 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1751 : unsigned *suggested_unroll_factor)
1752 : {
1753 112192 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1754 112192 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1755 :
1756 : /* Only loops that can handle partially-populated vectors can have iteration
1757 : counts less than the vectorization factor. */
1758 112192 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
1759 112192 : && vect_known_niters_smaller_than_vf (loop_vinfo))
1760 : {
1761 10788 : if (dump_enabled_p ())
1762 236 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1763 : "not vectorized: iteration count smaller than "
1764 : "vectorization factor.\n");
1765 10788 : return 0;
1766 : }
1767 :
1768 : /* If we know the number of iterations we can do better, for the
1769 : epilogue we can also decide whether the main loop leaves us
1770 : with enough iterations, preferring a smaller vector epilog then
1771 : also possibly used for the case we skip the vector loop. */
1772 101404 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1773 : {
1774 44131 : widest_int scalar_niters
1775 44131 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
1776 44131 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1777 : {
     : /* For an epilogue, reduce SCALAR_NITERS to the number of scalar
     : iterations the epilogue will actually see: what remains of the
     : main loop's VF after prologue peeling and any gap iteration. */
1778 2691 : loop_vec_info orig_loop_vinfo
1779 : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1780 2691 : loop_vec_info main_loop_vinfo
1781 : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
1782 2691 : unsigned lowest_vf
1783 2691 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
1784 2691 : int prolog_peeling = 0;
1785 2691 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
1786 2691 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
1787 2691 : if (prolog_peeling >= 0
1788 2691 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
1789 : lowest_vf))
1790 : {
1791 5372 : unsigned gap
1792 2686 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
1793 5372 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
1794 5372 : % lowest_vf + gap);
1795 : }
1796 : }
1797 : /* Reject vectorizing for a single scalar iteration, even if
1798 : we could in principle implement that using partial vectors.
1799 : But allow such vectorization if VF == 1 in case we do not
1800 : need to peel for gaps (if we need, avoid vectorization for
1801 : reasons of code footprint). */
1802 44131 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1803 44131 : if (scalar_niters <= peeling_gap + 1
1804 44131 : && (assumed_vf > 1 || peeling_gap != 0))
1805 : {
1806 690 : if (dump_enabled_p ())
1807 159 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1808 : "not vectorized: loop only has a single "
1809 : "scalar iteration.\n");
1810 690 : return 0;
1811 : }
1812 :
1813 43441 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1814 : {
1815 : /* Check that the loop processes at least one full vector. */
1816 43430 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1817 43430 : if (known_lt (scalar_niters, vf))
1818 : {
1819 364 : if (dump_enabled_p ())
1820 296 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1821 : "loop does not have enough iterations "
1822 : "to support vectorization.\n");
1823 404 : return 0;
1824 : }
1825 :
1826 : /* If we need to peel an extra epilogue iteration to handle data
1827 : accesses with gaps, check that there are enough scalar iterations
1828 : available.
1829 :
1830 : The check above is redundant with this one when peeling for gaps,
1831 : but the distinction is useful for diagnostics. */
1832 43066 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1833 43365 : && known_le (scalar_niters, vf))
1834 : {
1835 40 : if (dump_enabled_p ())
1836 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 : "loop does not have enough iterations "
1838 : "to support peeling for gaps.\n");
1839 40 : return 0;
1840 : }
1841 : }
1842 44131 : }
1843 :
1844 : /* If using the "very cheap" model, reject cases in which we'd keep
1845 : a copy of the scalar code (even if we might be able to vectorize it). */
1846 100310 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1847 100310 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1848 49594 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
1849 : {
1850 721 : if (dump_enabled_p ())
1851 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1852 : "some scalar iterations would need to be peeled\n");
1853 721 : return 0;
1854 : }
1855 :
1856 99589 : int min_profitable_iters, min_profitable_estimate;
1857 99589 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1858 : &min_profitable_estimate,
1859 : suggested_unroll_factor);
1860 :
     : /* A negative threshold means the target determined vectorization can
     : never be profitable for this loop. */
1861 99589 : if (min_profitable_iters < 0)
1862 : {
1863 24161 : if (dump_enabled_p ())
1864 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1865 : "not vectorized: vectorization not profitable.\n");
1866 24161 : if (dump_enabled_p ())
1867 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868 : "not vectorized: vector version will never be "
1869 : "profitable.\n");
1870 24161 : return -1;
1871 : }
1872 :
1873 75428 : int min_scalar_loop_bound = (param_min_vect_loop_bound
1874 75428 : * assumed_vf);
1875 :
1876 : /* Use the cost model only if it is more conservative than user specified
1877 : threshold. */
1878 75428 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1879 : min_profitable_iters);
1880 :
1881 75428 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1882 :
1883 38061 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1884 113489 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1885 : {
1886 384 : if (dump_enabled_p ())
1887 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 : "not vectorized: vectorization not profitable.\n");
1889 384 : if (dump_enabled_p ())
1890 1 : dump_printf_loc (MSG_NOTE, vect_location,
1891 : "not vectorized: iteration count smaller than user "
1892 : "specified loop bound parameter or minimum profitable "
1893 : "iterations (whichever is more conservative).\n");
1894 384 : return 0;
1895 : }
1896 :
1897 : /* The static profitability threshold min_profitable_estimate includes
1898 : the cost of having to check at runtime whether the scalar loop
1899 : should be used instead. If it turns out that we don't need or want
1900 : such a check, the threshold we should use for the static estimate
1901 : is simply the point at which the vector loop becomes more profitable
1902 : than the scalar loop. */
1903 75044 : if (min_profitable_estimate > min_profitable_iters
1904 16099 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1905 15584 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1906 287 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1907 75331 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1908 : {
1909 11 : if (dump_enabled_p ())
1910 6 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1911 : " choice between the scalar and vector loops\n");
1912 11 : min_profitable_estimate = min_profitable_iters;
1913 : }
1914 :
1915 : /* If the vector loop needs multiple iterations to be beneficial then
1916 : things are probably too close to call, and the conservative thing
1917 : would be to stick with the scalar code. */
1918 75044 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1919 75044 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1920 : {
1921 8541 : if (dump_enabled_p ())
1922 177 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1923 : "one iteration of the vector loop would be"
1924 : " more expensive than the equivalent number of"
1925 : " iterations of the scalar loop\n");
1926 8541 : return 0;
1927 : }
1928 :
1929 66503 : HOST_WIDE_INT estimated_niter;
1930 :
1931 : /* If we are vectorizing an epilogue then we know the maximum number of
1932 : scalar iterations it will cover is at least one lower than the
1933 : vectorization factor of the main loop. */
1934 66503 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1935 11023 : estimated_niter
1936 11023 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1937 : else
1938 : {
1939 55480 : estimated_niter = estimated_stmt_executions_int (loop);
1940 55480 : if (estimated_niter == -1)
1941 21280 : estimated_niter = likely_max_stmt_executions_int (loop);
1942 : }
1943 32303 : if (estimated_niter != -1
1944 64676 : && ((unsigned HOST_WIDE_INT) estimated_niter
1945 64676 : < MAX (th, (unsigned) min_profitable_estimate)))
1946 : {
1947 4465 : if (dump_enabled_p ())
1948 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1949 : "not vectorized: estimated iteration count too "
1950 : "small.\n");
1951 4465 : if (dump_enabled_p ())
1952 28 : dump_printf_loc (MSG_NOTE, vect_location,
1953 : "not vectorized: estimated iteration count smaller "
1954 : "than specified loop bound parameter or minimum "
1955 : "profitable iterations (whichever is more "
1956 : "conservative).\n");
1957 4465 : return -1;
1958 : }
1959 :
1960 : /* As we cannot use a runtime check to gate profitability for uncounted
1961 : loops require either an estimate or if none, at least a profitable
1962 : vectorization within the first vector iteration (that condition
1963 : will practically never be true due to the required epilog and
1964 : likely alignment prologue). */
1965 62038 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)
1966 155 : && estimated_niter == -1
1967 62166 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1968 : {
1969 112 : if (dump_enabled_p ())
1970 2 : dump_printf_loc (MSG_NOTE, vect_location,
1971 : "not vectorized: no loop iteration estimate on the "
1972 : "uncounted loop and not trivially profitable.\n");
1973 112 : return -1;
1974 : }
1975 :
1976 : return 1;
1977 : }
1978 :
1979 : /* Gather data references in LOOP with body BBS and store them into
1980 : *DATAREFS. */
1981 :
1982 : static opt_result
1983 279213 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1984 : vec<data_reference_p> *datarefs)
1985 : {
1986 833969 : for (unsigned i = 0; i < loop->num_nodes; i++)
1987 1236610 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1988 5227591 : !gsi_end_p (gsi); gsi_next (&gsi))
1989 : {
1990 4672835 : gimple *stmt = gsi_stmt (gsi);
1991 4672835 : if (is_gimple_debug (stmt))
1992 2148774 : continue;
1993 2524191 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1994 : NULL, 0);
1995 2524191 : if (!res)
1996 : {
     : /* An unanalyzable call in a loop with safelen may still be
     : vectorizable as a call to a simd clone, provided neither its
     : arguments nor its lhs reference memory -- detect that case
     : and keep scanning instead of failing. */
1997 63679 : if (is_gimple_call (stmt) && loop->safelen)
1998 : {
1999 404 : tree fndecl = gimple_call_fndecl (stmt), op;
2000 404 : if (fndecl == NULL_TREE
2001 404 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2002 : {
     : /* For IFN_MASK_CALL the real callee is the first argument. */
2003 0 : fndecl = gimple_call_arg (stmt, 0);
2004 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2005 0 : fndecl = TREE_OPERAND (fndecl, 0);
2006 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2007 : }
2008 404 : if (fndecl != NULL_TREE)
2009 : {
2010 367 : cgraph_node *node = cgraph_node::get (fndecl);
2011 367 : if (node != NULL && node->simd_clones != NULL)
2012 : {
2013 131 : unsigned int j, n = gimple_call_num_args (stmt);
2014 545 : for (j = 0; j < n; j++)
2015 : {
2016 284 : op = gimple_call_arg (stmt, j);
2017 284 : if (DECL_P (op)
2018 284 : || (REFERENCE_CLASS_P (op)
2019 0 : && get_base_address (op)))
2020 : break;
2021 : }
2022 131 : op = gimple_call_lhs (stmt);
2023 : /* Ignore #pragma omp declare simd functions
2024 : if they don't have data references in the
2025 : call stmt itself. */
2026 261 : if (j == n
2027 131 : && !(op
2028 120 : && (DECL_P (op)
2029 120 : || (REFERENCE_CLASS_P (op)
2030 0 : && get_base_address (op)))))
2031 130 : continue;
2032 : }
2033 : }
2034 : }
2035 63549 : return res;
2036 : }
2037 : /* If dependence analysis will give up due to the limit on the
2038 : number of datarefs stop here and fail fatally. */
2039 4311520 : if (datarefs->length ()
2040 1851008 : > (unsigned)param_loop_max_datarefs_for_datadeps)
2041 0 : return opt_result::failure_at (stmt, "exceeded param "
2042 : "loop-max-datarefs-for-datadeps\n");
2043 : }
2044 215664 : return opt_result::success ();
2045 : }
2046 :
2047 : /* Determine if operating on full vectors for LOOP_VINFO might leave
2048 : some scalar iterations still to do. If so, decide how we should
2049 : handle those scalar iterations. The possibilities are:
2050 :
2051 : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2052 : In this case:
2053 :
2054 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2055 : LOOP_VINFO_PEELING_FOR_NITER == false
2056 :
2057 : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2058 : to handle the remaining scalar iterations. In this case:
2059 :
2060 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2061 : LOOP_VINFO_PEELING_FOR_NITER == true
2062 :
2063 : The MASKED_P argument specifies to what extent
2064 : param_vect_partial_vector_usage is to be honored. For MASKED_P == 0
2065 : no partial vectors are to be used, for MASKED_P == -1 it's
2066 : param_vect_partial_vector_usage that gets to decide whether we may
2067 : consider partial vector usage. For MASKED_P == 1 partial vectors
2068 : may be used if possible.
2069 :
2070 : */
2071 :
2072 : static opt_result
2073 112993 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2074 : int masked_p)
2075 : {
2076 : /* Determine whether there would be any scalar iterations left over. */
2077 112993 : bool need_peeling_or_partial_vectors_p
2078 112993 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2079 :
2080 : /* Decide whether to vectorize the loop with partial vectors. */
2081 112993 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2082 112993 : if (masked_p == 0
2083 112993 : || (masked_p == -1 && param_vect_partial_vector_usage == 0))
2084 : /* If requested explicitly do not use partial vectors. */
2085 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2086 121 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2087 42 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2088 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2089 121 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2090 42 : && need_peeling_or_partial_vectors_p)
2091 : {
2092 : /* For partial-vector-usage=1, try to push the handling of partial
2093 : vectors to the epilogue, with the main loop continuing to operate
2094 : on full vectors.
2095 :
2096 : If we are unrolling we also do not want to use partial vectors. This
2097 : is to avoid the overhead of generating multiple masks and also to
2098 : avoid having to execute entire iterations of FALSE masked instructions
2099 : when dealing with one or less full iterations.
2100 :
2101 : ??? We could then end up failing to use partial vectors if we
2102 : decide to peel iterations into a prologue, and if the main loop
2103 : then ends up processing fewer than VF iterations. */
2104 34 : if ((param_vect_partial_vector_usage == 1
2105 8 : || loop_vinfo->suggested_unroll_factor > 1)
2106 26 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2107 52 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2108 : ;
2109 : else
2110 26 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2111 : }
2112 :
     : /* Partial vectors being mandatory but not enabled above is a hard
     : failure for this loop_vinfo. */
2113 112993 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2114 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2115 0 : return opt_result::failure_at (vect_location,
2116 : "not vectorized: loop needs but cannot "
2117 : "use partial vectors\n");
2118 :
2119 112993 : if (dump_enabled_p ())
2120 12004 : dump_printf_loc (MSG_NOTE, vect_location,
2121 : "operating on %s vectors%s.\n",
2122 12004 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2123 : ? "partial" : "full",
2124 12004 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2125 : ? " for epilogue loop" : "");
2126 :
     : /* Peeling for niters is needed exactly when we are not using partial
     : vectors but scalar iterations would otherwise remain. */
2127 112993 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2128 225986 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2129 112993 : && need_peeling_or_partial_vectors_p);
2130 :
2131 112993 : return opt_result::success ();
2132 : }
2133 :
2134 : /* Function vect_analyze_loop_2.
2135 :
2136 : Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2137 : analyses will record information in some members of LOOP_VINFO. FATAL
2138 : indicates if some analysis meets fatal error. If one non-NULL pointer
2139 : SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2140 : worked out suggested unroll factor, while one NULL pointer shows it's
2141 : going to apply the suggested unroll factor.
2142 : SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF is to hold whether single-lane
2143 : slp was forced when the suggested unroll factor was worked out. */
2144 : static opt_result
2145 499802 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, int masked_p, bool &fatal,
2146 : unsigned *suggested_unroll_factor,
2147 : bool& single_lane_slp_done_for_suggested_uf)
2148 : {
2149 499802 : opt_result ok = opt_result::success ();
2150 499802 : int res;
2151 499802 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2152 499802 : loop_vec_info orig_loop_vinfo = NULL;
2153 :
2154 : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2155 : loop_vec_info of the first vectorized loop. */
2156 499802 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2157 18056 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2158 : else
2159 : orig_loop_vinfo = loop_vinfo;
2160 18056 : gcc_assert (orig_loop_vinfo);
2161 :
2162 : /* We can't mask on niters for uncounted loops due to unknown upper bound. */
2163 499802 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
2164 87689 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2165 :
2166 : /* The first group of checks is independent of the vector size. */
2167 499802 : fatal = true;
2168 :
2169 499802 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2170 499802 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2171 5 : return opt_result::failure_at (vect_location,
2172 : "not vectorized: simd if(0)\n");
2173 :
2174 : /* Find all data references in the loop (which correspond to vdefs/vuses)
2175 : and analyze their evolution in the loop. */
2176 :
2177 499797 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2178 :
2179 : /* Gather the data references. */
2180 499797 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2181 : {
2182 279213 : opt_result res
2183 279213 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2184 : &LOOP_VINFO_DATAREFS (loop_vinfo));
2185 279213 : if (!res)
2186 : {
2187 63549 : if (dump_enabled_p ())
2188 1642 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2189 : "not vectorized: loop contains function "
2190 : "calls or data references that cannot "
2191 : "be analyzed\n");
2192 63549 : return res;
2193 : }
2194 215664 : loop_vinfo->shared->save_datarefs ();
2195 : }
2196 : else
2197 220584 : loop_vinfo->shared->check_datarefs ();
2198 :
2199 : /* Analyze the data references and also adjust the minimal
2200 : vectorization factor according to the loads and stores. */
2201 :
2202 436248 : ok = vect_analyze_data_refs (loop_vinfo, &fatal);
2203 436248 : if (!ok)
2204 : {
2205 58593 : if (dump_enabled_p ())
2206 1033 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2207 : "bad data references.\n");
2208 58593 : return ok;
2209 : }
2210 :
2211 : /* Check if we are applying unroll factor now. */
2212 377655 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2213 377655 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2214 :
2215 : /* When single-lane SLP was forced and we are applying suggested unroll
2216 : factor, keep that decision here. */
2217 755310 : bool force_single_lane = (applying_suggested_uf
2218 377655 : && single_lane_slp_done_for_suggested_uf);
2219 :
2220 : /* Classify all cross-iteration scalar data-flow cycles.
2221 : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2222 377655 : vect_analyze_scalar_cycles (loop_vinfo);
2223 :
2224 377655 : vect_pattern_recog (loop_vinfo);
2225 :
2226 : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2227 : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2228 :
2229 377655 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2230 377655 : if (!ok)
2231 : {
2232 7929 : if (dump_enabled_p ())
2233 291 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2234 : "bad data access.\n");
2235 7929 : return ok;
2236 : }
2237 :
2238 : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2239 :
2240 369726 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2241 369726 : if (!ok)
2242 : {
2243 46217 : if (dump_enabled_p ())
2244 399 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2245 : "unexpected pattern.\n");
2246 46217 : return ok;
2247 : }
2248 :
2249 : /* While the rest of the analysis below depends on it in some way. */
2250 323509 : fatal = false;
2251 :
2252 : /* Analyze data dependences between the data-refs in the loop
2253 : and adjust the maximum vectorization factor according to
2254 : the dependences.
2255 : FORNOW: fail at the first data dependence that we encounter. */
2256 :
2257 323509 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2258 323509 : if (!ok)
2259 : {
2260 25546 : if (dump_enabled_p ())
2261 532 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 : "bad data dependence.\n");
2263 25546 : return ok;
2264 : }
2265 297963 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2266 :
2267 : /* Compute the scalar iteration cost. */
2268 297963 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2269 :
2270 297963 : bool saved_can_use_partial_vectors_p
2271 : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2272 :
2273 : /* This is the point where we can re-start analysis with single-lane
2274 : SLP forced. */
2275 427270 : start_over:
2276 :
2277 : /* Check the SLP opportunities in the loop, analyze and build
2278 : SLP trees. */
2279 854540 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2280 : force_single_lane);
2281 427270 : if (!ok)
2282 21447 : return ok;
2283 :
2284 : /* If there are any SLP instances mark them as pure_slp and compute
2285 : the overall vectorization factor. */
2286 405823 : if (!vect_make_slp_decision (loop_vinfo))
2287 46564 : return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
2288 :
2289 359259 : if (dump_enabled_p ())
2290 18419 : dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
2291 :
2292 : /* Dump the vectorization factor from the SLP decision. */
2293 359259 : if (dump_enabled_p ())
2294 : {
2295 18419 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
2296 18419 : dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2297 18419 : dump_printf (MSG_NOTE, "\n");
2298 : }
2299 :
2300 : /* We don't expect to have to roll back to anything other than an empty
2301 : set of rgroups. */
2302 359259 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2303 :
2304 : /* Apply the suggested unrolling factor, this was determined by the backend
2305 : during finish_cost the first time we ran the analysis for this
2306 : vector mode. */
2307 359259 : if (applying_suggested_uf)
2308 247 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2309 :
2310 : /* Now the vectorization factor is final. */
2311 359259 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2312 359259 : gcc_assert (known_ne (vectorization_factor, 0U));
2313 :
2314 : /* Optimize the SLP graph with the vectorization factor fixed. */
2315 359259 : vect_optimize_slp (loop_vinfo);
2316 :
2317 : /* Gather the loads reachable from the SLP graph entries. */
2318 359259 : vect_gather_slp_loads (loop_vinfo);
2319 :
2320 359259 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2321 : {
2322 13815 : dump_printf_loc (MSG_NOTE, vect_location,
2323 : "vectorization_factor = ");
2324 13815 : dump_dec (MSG_NOTE, vectorization_factor);
2325 13815 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2326 13815 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2327 : }
2328 :
2329 359259 : if (max_vf != MAX_VECTORIZATION_FACTOR
2330 359259 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2331 41 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2332 :
2333 359218 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2334 :
2335 : /* Analyze the alignment of the data-refs in the loop. */
2336 359218 : vect_analyze_data_refs_alignment (loop_vinfo);
2337 :
2338 : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2339 : It is important to call pruning after vect_analyze_data_ref_accesses,
2340 : since we use grouping information gathered by interleaving analysis. */
2341 359218 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2342 359218 : if (!ok)
2343 17068 : return ok;
2344 :
2345 : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2346 : vectorization, since we do not want to add extra peeling or
2347 : add versioning for alignment. */
2348 342150 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2349 : /* This pass will decide on using loop versioning and/or loop peeling in
2350 : order to enhance the alignment of data references in the loop. */
2351 327039 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2352 342150 : if (!ok)
2353 0 : return ok;
2354 :
2355 : /* Analyze operations in the SLP instances. We can't simply
2356 : remove unsupported SLP instances as this makes the above
2357 : SLP kind detection invalid and might also affect the VF. */
2358 342150 : if (! vect_slp_analyze_operations (loop_vinfo))
2359 : {
2360 229157 : ok = opt_result::failure_at (vect_location,
2361 : "unsupported SLP instances\n");
2362 229157 : goto again;
2363 : }
2364 :
2365 : /* For now, we don't expect to mix both masking and length approaches for one
2366 : loop, disable it if both are recorded. */
2367 112993 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2368 16761 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2369 129748 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2370 : {
2371 0 : if (dump_enabled_p ())
2372 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2373 : "can't vectorize a loop with partial vectors"
2374 : " because we don't expect to mix different"
2375 : " approaches with partial vectors for the"
2376 : " same loop.\n");
2377 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2378 : }
2379 :
2380 : /* If we still have the option of using partial vectors,
2381 : check whether we can generate the necessary loop controls. */
2382 112993 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2383 : {
2384 16761 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2385 : {
2386 16755 : if (!vect_verify_full_masking (loop_vinfo)
2387 16755 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2388 3655 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2389 : }
2390 : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2391 6 : if (!vect_verify_loop_lens (loop_vinfo))
2392 6 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2393 : }
2394 :
2395 : /* Decide whether this loop_vinfo should use partial vectors or peeling,
2396 : assuming that the loop will be used as a main loop. We will redo
2397 : this analysis later if we instead decide to use the loop as an
2398 : epilogue loop. */
2399 112993 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, masked_p);
2400 112993 : if (!ok)
2401 0 : return ok;
2402 :
2403 : /* If we're vectorizing a loop that uses length "controls" and
2404 : can iterate more than once, we apply decrementing IV approach
2405 : in loop control. */
2406 112993 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2407 26 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2408 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2409 112993 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2410 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2411 : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2412 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2413 :
2414 : /* If a loop uses length controls and has a decrementing loop control IV,
2415 : we will normally pass that IV through a MIN_EXPR to calculate the
2416 : basis for the length controls. E.g. in a loop that processes one
2417 : element per scalar iteration, the number of elements would be
2418 : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2419 :
2420 : This MIN_EXPR approach allows us to use pointer IVs with an invariant
2421 : step, since only the final iteration of the vector loop can have
2422 : inactive lanes.
2423 :
2424 : However, some targets have a dedicated instruction for calculating the
2425 : preferred length, given the total number of elements that still need to
2426 : be processed. This is encapsulated in the SELECT_VL internal function.
2427 :
2428 : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2429 : to determine the basis for the length controls. However, unlike the
2430 : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2431 : lanes inactive in any iteration of the vector loop, not just the last
2432 : iteration. This SELECT_VL approach therefore requires us to use pointer
2433 : IVs with variable steps.
2434 :
2435 : Once we've decided how many elements should be processed by one
2436 : iteration of the vector loop, we need to populate the rgroup controls.
2437 : If a loop has multiple rgroups, we need to make sure that those rgroups
2438 : "line up" (that is, they must be consistent about which elements are
2439 : active and which aren't). This is done by vect_adjust_loop_lens_control.
2440 :
2441 : In principle, it would be possible to use vect_adjust_loop_lens_control
2442 : on either the result of a MIN_EXPR or the result of a SELECT_VL.
2443 : However:
2444 :
2445 : (1) In practice, it only makes sense to use SELECT_VL when a vector
2446 : operation will be controlled directly by the result. It is not
2447 : worth using SELECT_VL if it would only be the input to other
2448 : calculations.
2449 :
2450 : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2451 : pointer IV will need N updates by a variable amount (N-1 updates
2452 : within the iteration and 1 update to move to the next iteration).
2453 :
2454 : Because of this, we prefer to use the MIN_EXPR approach whenever there
2455 : is more than one length control.
2456 :
2457 : In addition, SELECT_VL always operates to a granularity of 1 unit.
2458 : If we wanted to use it to control an SLP operation on N consecutive
2459 : elements, we would need to make the SELECT_VL inputs measure scalar
2460 : iterations (rather than elements) and then multiply the SELECT_VL
2461 : result by N. But using SELECT_VL this way is inefficient because
2462 : of (1) above.
2463 :
2464 : 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2465 : satisfied:
2466 :
2467 : (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2468 : (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2469 :
2470 : Since SELECT_VL (variable step) will make SCEV analysis fail, we would
2471 : then fail to gain the benefits of subsequent unroll optimizations. We prefer
2472 : using the MIN_EXPR approach in this situation. */
2473 112993 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2474 : {
2475 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2476 0 : if (LOOP_VINFO_LENS (loop_vinfo).length () == 1
2477 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
2478 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2479 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2480 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2481 :
2482 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2483 0 : for (auto rgc : LOOP_VINFO_LENS (loop_vinfo))
2484 0 : if (rgc.type
2485 0 : && !direct_internal_fn_supported_p (IFN_SELECT_VL,
2486 : rgc.type, iv_type,
2487 : OPTIMIZE_FOR_SPEED))
2488 : {
2489 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2490 0 : break;
2491 : }
2492 :
2493 : /* If any of the SLP instances cover more than a single lane
2494 : we cannot use .SELECT_VL at the moment, even if the number
2495 : of lanes is uniform throughout the SLP graph. */
2496 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2497 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
2498 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
2499 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
2500 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
2501 : {
2502 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2503 0 : break;
2504 : }
2505 : }
2506 :
2507 : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2508 : to be able to handle fewer than VF scalars, or needs to have a lower VF
2509 : than the main loop. */
2510 112993 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2511 12656 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2512 : {
2513 12646 : poly_uint64 unscaled_vf
2514 12646 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2515 : orig_loop_vinfo->suggested_unroll_factor);
2516 12646 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
2517 285 : return opt_result::failure_at (vect_location,
2518 : "Vectorization factor too high for"
2519 : " epilogue loop.\n");
2520 : }
2521 :
2522 : /* If the epilogue needs peeling for gaps but the main loop doesn't give
2523 : up on the epilogue. */
2524 112708 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2525 12371 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2526 67 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
2527 : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2528 4 : return opt_result::failure_at (vect_location,
2529 : "Epilogue loop requires peeling for gaps "
2530 : "but main loop does not.\n");
2531 :
2532 : /* If an epilogue loop is required make sure we can create one. */
2533 112704 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2534 111452 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2535 32635 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2536 : {
2537 81009 : if (dump_enabled_p ())
2538 5282 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2539 81009 : if (!vect_can_advance_ivs_p (loop_vinfo)
2540 161506 : || !slpeel_can_duplicate_loop_p (loop,
2541 : LOOP_VINFO_MAIN_EXIT (loop_vinfo),
2542 80497 : LOOP_VINFO_MAIN_EXIT (loop_vinfo)))
2543 : {
2544 512 : ok = opt_result::failure_at (vect_location,
2545 : "not vectorized: can't create required "
2546 : "epilog loop\n");
2547 512 : goto again;
2548 : }
2549 : }
2550 :
2551 : /* Check the costings of the loop make vectorizing worthwhile. */
2552 112192 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2553 112192 : if (res < 0)
2554 : {
2555 28738 : ok = opt_result::failure_at (vect_location,
2556 : "Loop costings may not be worthwhile.\n");
2557 28738 : goto again;
2558 : }
2559 83454 : if (!res)
2560 21528 : return opt_result::failure_at (vect_location,
2561 : "Loop costings not worthwhile.\n");
2562 :
2563 : /* During peeling, we need to check if number of loop iterations is
2564 : enough for both peeled prolog loop and vector loop. This check
2565 : can be merged along with threshold check of loop versioning, so
2566 : increase threshold for this case if necessary.
2567 :
2568 : If we are analyzing an epilogue we still want to check what its
2569 : versioning threshold would be. If we decide to vectorize the epilogues we
2570 : will want to use the lowest versioning threshold of all epilogues and main
2571 : loop. This will enable us to enter a vectorized epilogue even when
2572 : versioning the loop. We can't simply check whether the epilogue requires
2573 : versioning though since we may have skipped some versioning checks when
2574 : analyzing the epilogue. For instance, checks for alias versioning will be
2575 : skipped when dealing with epilogues as we assume we already checked them
2576 : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2577 61926 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2578 : {
2579 5814 : poly_uint64 niters_th = 0;
2580 5814 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2581 :
2582 5814 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2583 : {
2584 : /* Niters for peeled prolog loop. */
2585 5814 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2586 : {
2587 118 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2588 118 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2589 118 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2590 : }
2591 : else
2592 5696 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2593 : }
2594 :
2595 : /* Niters for at least one iteration of vectorized loop. */
2596 5814 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2597 5810 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2598 : /* One additional iteration because of peeling for gap. */
2599 5814 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2600 60 : niters_th += 1;
2601 :
2602 : /* Use the same condition as vect_transform_loop to decide when to use
2603 : the cost to determine a versioning threshold. */
2604 5814 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2605 5814 : && ordered_p (th, niters_th))
2606 3911 : niters_th = ordered_max (poly_uint64 (th), niters_th);
2607 :
2608 5814 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2609 : }
2610 :
2611 61926 : gcc_assert (known_eq (vectorization_factor,
2612 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2613 :
2614 61926 : single_lane_slp_done_for_suggested_uf = force_single_lane;
2615 :
2616 : /* Ok to vectorize! */
2617 61926 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2618 61926 : return opt_result::success ();
2619 :
2620 258407 : again:
2621 : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2622 258407 : gcc_assert (!ok);
2623 :
2624 : /* Try again with single-lane SLP. */
2625 258407 : if (force_single_lane)
2626 128160 : return ok;
2627 :
2628 : /* If we are applying suggested unroll factor, we don't need to
2629 : re-try any more as we want to keep the SLP mode fixed. */
2630 130247 : if (applying_suggested_uf)
2631 6 : return ok;
2632 :
2633 : /* Likewise if the grouped loads or stores in the SLP cannot be handled
2634 : via interleaving or lane instructions. */
2635 : slp_instance instance;
2636 : slp_tree node;
2637 : unsigned i, j;
2638 353890 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2639 : {
2640 224583 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
2641 0 : continue;
2642 :
2643 224583 : stmt_vec_info vinfo;
2644 224583 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2645 224583 : if (!vinfo || !STMT_VINFO_GROUPED_ACCESS (vinfo))
2646 222071 : continue;
2647 2512 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2648 2512 : unsigned int size = DR_GROUP_SIZE (vinfo);
2649 2512 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
2650 2512 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
2651 4344 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2652 5019 : && ! vect_grouped_store_supported (vectype, size))
2653 675 : return opt_result::failure_at (vinfo->stmt,
2654 : "unsupported grouped store\n");
2655 226763 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2656 : {
2657 1929 : vinfo = SLP_TREE_REPRESENTATIVE (node);
2658 1929 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
2659 : {
2660 1681 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2661 1681 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2662 1681 : size = DR_GROUP_SIZE (vinfo);
2663 1681 : vectype = SLP_TREE_VECTYPE (node);
2664 1681 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
2665 1681 : && ! vect_grouped_load_supported (vectype, single_element_p,
2666 : size))
2667 259 : return opt_result::failure_at (vinfo->stmt,
2668 : "unsupported grouped load\n");
2669 : }
2670 : }
2671 : }
2672 :
2673 : /* Roll back state appropriately. Force single-lane SLP this time. */
2674 129307 : force_single_lane = true;
2675 129307 : if (dump_enabled_p ())
2676 3297 : dump_printf_loc (MSG_NOTE, vect_location,
2677 : "re-trying with single-lane SLP\n");
2678 :
2679 : /* Reset the vectorization factor. */
2680 129307 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
2681 : /* Free the SLP instances. */
2682 352949 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2683 223642 : vect_free_slp_instance (instance);
2684 129307 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2685 : /* Reset altered state on stmts. */
2686 494739 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2687 : {
2688 365432 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2689 365432 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
2690 648738 : !gsi_end_p (si); gsi_next (&si))
2691 : {
2692 283306 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2693 283306 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2694 283306 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2695 : {
2696 : /* vectorizable_reduction adjusts reduction stmt def-types,
2697 : restore them to that of the PHI. */
2698 20556 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2699 20556 : = STMT_VINFO_DEF_TYPE (stmt_info);
2700 20556 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2701 : (STMT_VINFO_REDUC_DEF (stmt_info)))
2702 20556 : = STMT_VINFO_DEF_TYPE (stmt_info);
2703 : }
2704 : }
2705 730864 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
2706 2238684 : !gsi_end_p (si); gsi_next (&si))
2707 : {
2708 1873252 : if (is_gimple_debug (gsi_stmt (si)))
2709 718239 : continue;
2710 1155013 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2711 1155013 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2712 : {
2713 218933 : stmt_vec_info pattern_stmt_info
2714 : = STMT_VINFO_RELATED_STMT (stmt_info);
2715 218933 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2716 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2717 : }
2718 : }
2719 : }
2720 : /* Free optimized alias test DDRS. */
2721 129307 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2722 129307 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2723 129307 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2724 : /* Reset target cost data. */
2725 129307 : delete loop_vinfo->vector_costs;
2726 129307 : loop_vinfo->vector_costs = nullptr;
2727 : /* Reset accumulated rgroup information. */
2728 129307 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
2729 129307 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
2730 129307 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2731 : /* Reset assorted flags. */
2732 129307 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2733 129307 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2734 129307 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2735 129307 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2736 129307 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2737 129307 : = saved_can_use_partial_vectors_p;
2738 129307 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2739 129307 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2740 129307 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2741 129307 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = false;
2742 :
2743 129307 : if (loop_vinfo->scan_map)
2744 122 : loop_vinfo->scan_map->empty ();
2745 :
2746 129307 : goto start_over;
2747 : }
2748 :
2749 : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2750 : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2751 : OLD_LOOP_VINFO is better unless something specifically indicates
2752 : otherwise.
2753 :
2754 : Note that this deliberately isn't a partial order. */
2755 :
2756 : static bool
2757 5 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2758 : loop_vec_info old_loop_vinfo)
2759 : {
2760 5 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
/* Both candidates must describe the very same source loop.  */
2761 5 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2762 :
2763 5 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2764 5 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2765 :
2766 : /* Always prefer a VF of loop->simdlen over any other VF. */
2767 5 : if (loop->simdlen)
2768 : {
2769 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2770 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
/* If exactly one of the two candidates matches the user-requested
   simdlen it wins outright; if both or neither match, fall through
   to the cost comparison below.  */
2771 0 : if (new_simdlen_p != old_simdlen_p)
2772 : return new_simdlen_p;
2773 : }
2774 :
2775 5 : const auto *old_costs = old_loop_vinfo->vector_costs;
2776 5 : const auto *new_costs = new_loop_vinfo->vector_costs;
/* An epilogue candidate (one with an original main-loop vinfo) is
   judged relative to that main loop; otherwise compare the two as
   competing main-loop candidates.  */
2777 5 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2778 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2779 :
2780 5 : return new_costs->better_main_loop_than_p (old_costs);
2781 : }
2782 :
2783 : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2784 : true if we should. */
2785 :
2786 : static bool
2787 5 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2788 : loop_vec_info old_loop_vinfo)
2789 : {
/* Keep the incumbent unless the challenger is judged strictly better.  */
2790 5 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2791 : return false;
2792 :
/* Record the mode switch in the dump so -fopt-info users can see why a
   different vector mode was preferred.  */
2793 1 : if (dump_enabled_p ())
2794 1 : dump_printf_loc (MSG_NOTE, vect_location,
2795 : "***** Preferring vector mode %s to vector mode %s\n",
2796 1 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
2797 1 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
2798 : return true;
2799 : }
2800 :
2801 : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
2802 : not NULL. When MASKED_P is not -1 override the default
2803 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it.
2804 : Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance MODE_I to the next
2805 : mode useful to analyze.
2806 : Return the loop_vinfo on success and wrapped null on failure. */
2807 :
2808 : static opt_loop_vec_info
2809 499555 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2810 : const vect_loop_form_info *loop_form_info,
2811 : loop_vec_info orig_loop_vinfo,
2812 : const vector_modes &vector_modes, unsigned &mode_i,
2813 : int masked_p,
2814 : machine_mode &autodetected_vector_mode,
2815 : bool &fatal)
2816 : {
/* Create a fresh loop_vec_info for this attempt; ORIG_LOOP_VINFO being
   non-NULL marks this as an epilogue analysis.  */
2817 499555 : loop_vec_info loop_vinfo
2818 499555 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
2819 :
2820 499555 : machine_mode vector_mode = vector_modes[mode_i];
2821 499555 : loop_vinfo->vector_mode = vector_mode;
2822 499555 : unsigned int suggested_unroll_factor = 1;
2823 499555 : bool single_lane_slp_done_for_suggested_uf = false;
2824 :
2825 : /* Run the main analysis. */
2826 499555 : opt_result res = vect_analyze_loop_2 (loop_vinfo, masked_p, fatal,
2827 : &suggested_unroll_factor,
2828 : single_lane_slp_done_for_suggested_uf);
2829 499555 : if (dump_enabled_p ())
2830 20373 : dump_printf_loc (MSG_NOTE, vect_location,
2831 : "***** Analysis %s with vector mode %s\n",
2832 20373 : res ? "succeeded" : "failed",
2833 20373 : GET_MODE_NAME (loop_vinfo->vector_mode));
2834 :
/* Unrolling is only considered for successfully analyzed main loops,
   never for epilogues.  The factor can come from the target (via
   vect_analyze_loop_2) or from the user's loop->unroll pragma.  */
2835 499555 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
2836 499555 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2837 : /* Check to see if the user wants to unroll or if the target wants to. */
2838 554222 : && (suggested_unroll_factor > 1 || user_unroll > 1))
2839 : {
2840 261 : if (suggested_unroll_factor == 1)
2841 : {
/* No target suggestion: derive the unroll factor from the user's
   requested total unroll divided by the assumed VF.  */
2842 44 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
2843 44 : suggested_unroll_factor = user_unroll / assumed_vf;
2844 44 : if (suggested_unroll_factor > 1)
2845 : {
2846 30 : if (dump_enabled_p ())
2847 20 : dump_printf_loc (MSG_NOTE, vect_location,
2848 : "setting unroll factor to %d based on user requested "
2849 : "unroll factor %d and suggested vectorization "
2850 : "factor: %d\n",
2851 : suggested_unroll_factor, user_unroll, assumed_vf);
2852 : }
2853 : }
2854 :
2855 261 : if (suggested_unroll_factor > 1)
2856 : {
2857 247 : if (dump_enabled_p ())
2858 44 : dump_printf_loc (MSG_NOTE, vect_location,
2859 : "***** Re-trying analysis for unrolling"
2860 : " with unroll factor %d and %s slp.\n",
2861 : suggested_unroll_factor,
2862 : single_lane_slp_done_for_suggested_uf
2863 : ? "single-lane" : "");
/* Re-run the full analysis on a second vinfo with the unroll factor
   applied; keep whichever of the two attempts succeeded.  */
2864 247 : loop_vec_info unroll_vinfo
2865 247 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
2866 247 : unroll_vinfo->vector_mode = vector_mode;
2867 247 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2868 247 : opt_result new_res
2869 247 : = vect_analyze_loop_2 (unroll_vinfo, masked_p, fatal, NULL,
2870 : single_lane_slp_done_for_suggested_uf);
2871 247 : if (new_res)
2872 : {
2873 201 : delete loop_vinfo;
2874 201 : loop_vinfo = unroll_vinfo;
2875 : }
2876 : else
2877 46 : delete unroll_vinfo;
2878 : }
2879 :
2880 : /* Record that we have honored a user unroll factor. */
2881 261 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
2882 : }
2883 :
2884 : /* Remember the autodetected vector mode. */
2885 499555 : if (vector_mode == VOIDmode)
2886 270015 : autodetected_vector_mode = loop_vinfo->vector_mode;
2887 :
2888 : /* Advance mode_i, first skipping modes that would result in the
2889 : same analysis result. */
2890 2319213 : while (mode_i + 1 < vector_modes.length ()
2891 1638192 : && vect_chooses_same_modes_p (loop_vinfo,
2892 728363 : vector_modes[mode_i + 1]))
2893 : {
2894 410274 : if (dump_enabled_p ())
2895 17015 : dump_printf_loc (MSG_NOTE, vect_location,
2896 : "***** The result for vector mode %s would"
2897 : " be the same\n",
2898 17015 : GET_MODE_NAME (vector_modes[mode_i + 1]));
2899 410274 : mode_i += 1;
2900 : }
/* Also skip a next mode that would merely repeat the analysis already
   done for the autodetected mode.  */
2901 499555 : if (mode_i + 1 < vector_modes.length ()
2902 817644 : && vect_chooses_same_modes_p (autodetected_vector_mode,
2903 318089 : vector_modes[mode_i + 1]))
2904 : {
2905 349 : if (dump_enabled_p ())
2906 10 : dump_printf_loc (MSG_NOTE, vect_location,
2907 : "***** Skipping vector mode %s, which would"
2908 : " repeat the analysis for %s\n",
2909 10 : GET_MODE_NAME (vector_modes[mode_i + 1]),
2910 10 : GET_MODE_NAME (autodetected_vector_mode));
2911 349 : mode_i += 1;
2912 : }
/* Finally step past the mode analyzed in this call.  */
2913 499555 : mode_i++;
2914 :
2915 499555 : if (!res)
2916 : {
/* On failure release the vinfo.  A fatal failure is only expected
   for main-loop analysis (never for epilogues).  */
2917 437830 : delete loop_vinfo;
2918 437830 : if (fatal)
2919 105370 : gcc_checking_assert (orig_loop_vinfo == NULL);
2920 437830 : return opt_loop_vec_info::propagate_failure (res);
2921 : }
2922 :
2923 61725 : return opt_loop_vec_info::success (loop_vinfo);
2924 : }
2925 :
2926 : /* Function vect_analyze_loop.
2927 :
2928 : Apply a set of analyses on LOOP, and create a loop_vec_info struct
2929 : for it. The different analyses will record information in the
2930 : loop_vec_info struct. */
2931 : opt_loop_vec_info
2932 467718 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
2933 : vec_info_shared *shared)
2934 : {
2935 467718 : DUMP_VECT_SCOPE ("analyze_loop_nest");
2936 :
2937 467718 : if (loop_outer (loop)
2938 467718 : && loop_vec_info_for_loop (loop_outer (loop))
2939 468276 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2940 558 : return opt_loop_vec_info::failure_at (vect_location,
2941 : "outer-loop already vectorized.\n");
2942 :
2943 467160 : if (!find_loop_nest (loop, &shared->loop_nest))
2944 22378 : return opt_loop_vec_info::failure_at
2945 22378 : (vect_location,
2946 : "not vectorized: loop nest containing two or more consecutive inner"
2947 : " loops cannot be vectorized\n");
2948 :
2949 : /* Analyze the loop form. */
2950 444782 : vect_loop_form_info loop_form_info;
2951 444782 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
2952 : &loop_form_info);
2953 444782 : if (!res)
2954 : {
2955 174767 : if (dump_enabled_p ())
2956 1519 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2957 : "bad loop form.\n");
2958 174767 : return opt_loop_vec_info::propagate_failure (res);
2959 : }
2960 270015 : if (!integer_onep (loop_form_info.assumptions))
2961 : {
2962 : /* We consider to vectorize this loop by versioning it under
2963 : some assumptions. In order to do this, we need to clear
2964 : existing information computed by scev and niter analyzer. */
2965 8276 : scev_reset_htab ();
2966 8276 : free_numbers_of_iterations_estimates (loop);
2967 : /* Also set flag for this loop so that following scev and niter
2968 : analysis are done under the assumptions. */
2969 8276 : loop_constraint_set (loop, LOOP_C_FINITE);
2970 : }
2971 : else
2972 : /* Clear the existing niter information to make sure the nonwrapping flag
2973 : will be calculated and set propriately. */
2974 261739 : free_numbers_of_iterations_estimates (loop);
2975 :
2976 270015 : auto_vector_modes vector_modes;
2977 : /* Autodetect first vector size we try. */
2978 270015 : vector_modes.safe_push (VOIDmode);
2979 270015 : unsigned int autovec_flags
2980 540030 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2981 270015 : loop->simdlen != 0);
2982 270015 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2983 270015 : && !unlimited_cost_model (loop));
2984 270015 : machine_mode autodetected_vector_mode = VOIDmode;
2985 270015 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2986 270015 : unsigned int mode_i = 0;
2987 270015 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2988 :
2989 : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
2990 : a mode has not been analyzed. */
2991 270015 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
2992 2712330 : for (unsigned i = 0; i < vector_modes.length (); ++i)
2993 1086150 : cached_vf_per_mode.safe_push (0);
2994 :
2995 : /* First determine the main loop vectorization mode, either the first
2996 : one that works, starting with auto-detecting the vector mode and then
2997 : following the targets order of preference, or the one with the
2998 : lowest cost if pick_lowest_cost_p. */
2999 692983 : while (1)
3000 : {
3001 481499 : bool fatal;
3002 481499 : unsigned int last_mode_i = mode_i;
3003 : /* Set cached VF to -1 prior to analysis, which indicates a mode has
3004 : failed. */
3005 481499 : cached_vf_per_mode[last_mode_i] = -1;
3006 481499 : opt_loop_vec_info loop_vinfo
3007 481499 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3008 : NULL, vector_modes, mode_i, -1,
3009 : autodetected_vector_mode, fatal);
3010 481499 : if (fatal)
3011 : break;
3012 :
3013 376129 : if (loop_vinfo)
3014 : {
3015 : /* Analysis has been successful so update the VF value. The
3016 : VF should always be a multiple of unroll_factor and we want to
3017 : capture the original VF here. */
3018 54667 : cached_vf_per_mode[last_mode_i]
3019 54667 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3020 54667 : loop_vinfo->suggested_unroll_factor);
3021 : /* Once we hit the desired simdlen for the first time,
3022 : discard any previous attempts. */
3023 54667 : if (simdlen
3024 54667 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3025 : {
3026 47 : delete first_loop_vinfo;
3027 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3028 : simdlen = 0;
3029 : }
3030 54620 : else if (pick_lowest_cost_p
3031 10 : && first_loop_vinfo
3032 54625 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3033 : {
3034 : /* Pick loop_vinfo over first_loop_vinfo. */
3035 1 : delete first_loop_vinfo;
3036 1 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3037 : }
3038 54667 : if (first_loop_vinfo == NULL)
3039 : first_loop_vinfo = loop_vinfo;
3040 : else
3041 : {
3042 6 : delete loop_vinfo;
3043 6 : loop_vinfo = opt_loop_vec_info::success (NULL);
3044 : }
3045 :
3046 : /* Commit to first_loop_vinfo if we have no reason to try
3047 : alternatives. */
3048 54667 : if (!simdlen && !pick_lowest_cost_p)
3049 : break;
3050 : }
3051 321481 : if (mode_i == vector_modes.length ()
3052 321481 : || autodetected_vector_mode == VOIDmode)
3053 : break;
3054 :
3055 : /* Try the next biggest vector size. */
3056 211484 : if (dump_enabled_p ())
3057 3961 : dump_printf_loc (MSG_NOTE, vect_location,
3058 : "***** Re-trying analysis with vector mode %s\n",
3059 3961 : GET_MODE_NAME (vector_modes[mode_i]));
3060 211484 : }
3061 270015 : if (!first_loop_vinfo)
3062 215360 : return opt_loop_vec_info::propagate_failure (res);
3063 :
3064 54655 : if (dump_enabled_p ())
3065 9502 : dump_printf_loc (MSG_NOTE, vect_location,
3066 : "***** Choosing vector mode %s\n",
3067 9502 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3068 :
3069 : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3070 : enabled, SIMDUID is not set, it is the innermost loop and we have
3071 : either already found the loop's SIMDLEN or there was no SIMDLEN to
3072 : begin with.
3073 : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3074 54655 : bool vect_epilogues = (!simdlen
3075 54653 : && loop->inner == NULL
3076 54081 : && param_vect_epilogues_nomask
3077 53008 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3078 : /* No code motion support for multiple epilogues so for now
3079 : not supported when multiple exits. */
3080 26151 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3081 25679 : && !loop->simduid
3082 78921 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3083 54655 : if (!vect_epilogues)
3084 41747 : return first_loop_vinfo;
3085 :
3086 : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3087 :
3088 : /* For epilogues start the analysis from the first mode. The motivation
3089 : behind starting from the beginning comes from cases where the VECTOR_MODES
3090 : array may contain length-agnostic and length-specific modes. Their
3091 : ordering is not guaranteed, so we could end up picking a mode for the main
3092 : loop that is after the epilogue's optimal mode. */
3093 12908 : int masked_p = -1;
3094 12908 : if (!unlimited_cost_model (loop)
3095 12908 : && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3096 : != VOIDmode))
3097 : {
3098 4 : vector_modes[0]
3099 4 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3100 4 : cached_vf_per_mode[0] = 0;
3101 : }
3102 : else
3103 12904 : vector_modes[0] = autodetected_vector_mode;
3104 12908 : mode_i = 0;
3105 :
3106 12944 : bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
3107 12908 : || masked_p == 1);
3108 : if (supports_partial_vectors
3109 36 : && !partial_vectors_supported_p ()
3110 36 : && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo))
3111 : supports_partial_vectors = false;
3112 12908 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3113 :
3114 12908 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3115 13096 : do
3116 : {
3117 : /* Let the user override what the target suggests. */
3118 13002 : if (OPTION_SET_P (param_vect_partial_vector_usage))
3119 43 : masked_p = -1;
3120 :
3121 44528 : while (1)
3122 : {
3123 : /* If the target does not support partial vectors we can shorten the
3124 : number of modes to analyze for the epilogue as we know we can't
3125 : pick a mode that would lead to a VF at least as big as the
3126 : FIRST_VINFO_VF. */
3127 58305 : if (!supports_partial_vectors
3128 44528 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3129 : {
3130 13806 : mode_i++;
3131 27612 : if (mode_i == vector_modes.length ())
3132 : break;
3133 26443 : continue;
3134 : }
3135 : /* We would need an exhaustive search to find all modes we
3136 : skipped but that would lead to the same result as the
3137 : analysis it was skipped for and where we'd could check
3138 : cached_vf_per_mode against.
3139 : Check for the autodetected mode, which is the common
3140 : situation on x86 which does not perform cost comparison. */
3141 43388 : if (!supports_partial_vectors
3142 30712 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3143 60895 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3144 30173 : vector_modes[mode_i]))
3145 : {
3146 12666 : mode_i++;
3147 25332 : if (mode_i == vector_modes.length ())
3148 : break;
3149 12666 : continue;
3150 : }
3151 :
3152 18056 : if (dump_enabled_p ())
3153 3232 : dump_printf_loc (MSG_NOTE, vect_location,
3154 : "***** Re-trying epilogue analysis with vector "
3155 3232 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3156 :
3157 18056 : bool fatal;
3158 18056 : opt_loop_vec_info loop_vinfo
3159 18056 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3160 : orig_loop_vinfo,
3161 : vector_modes, mode_i, masked_p,
3162 : autodetected_vector_mode, fatal);
3163 18056 : if (fatal)
3164 : break;
3165 :
3166 18056 : if (loop_vinfo)
3167 : {
3168 7058 : if (pick_lowest_cost_p
3169 4 : && orig_loop_vinfo->epilogue_vinfo
3170 7058 : && vect_joust_loop_vinfos (loop_vinfo,
3171 0 : orig_loop_vinfo->epilogue_vinfo))
3172 : {
3173 0 : gcc_assert (vect_epilogues);
3174 0 : delete orig_loop_vinfo->epilogue_vinfo;
3175 0 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3176 : }
3177 7058 : if (!orig_loop_vinfo->epilogue_vinfo)
3178 7058 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3179 : else
3180 : {
3181 0 : delete loop_vinfo;
3182 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3183 : }
3184 :
3185 : /* For now only allow one epilogue loop, but allow
3186 : pick_lowest_cost_p to replace it, so commit to the
3187 : first epilogue if we have no reason to try alternatives. */
3188 7058 : if (!pick_lowest_cost_p)
3189 : break;
3190 : }
3191 :
3192 : /* Revert back to the default from the suggested prefered
3193 : epilogue vectorization mode. */
3194 11002 : masked_p = -1;
3195 22004 : if (mode_i == vector_modes.length ())
3196 : break;
3197 : }
3198 :
3199 13002 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3200 13002 : if (!orig_loop_vinfo)
3201 : break;
3202 :
3203 : /* When we selected a first vectorized epilogue, see if the target
3204 : suggests to have another one. */
3205 7058 : masked_p = -1;
3206 7058 : if (!unlimited_cost_model (loop)
3207 4122 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3208 11173 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3209 : != VOIDmode))
3210 : {
3211 188 : vector_modes[0]
3212 94 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3213 94 : cached_vf_per_mode[0] = 0;
3214 94 : mode_i = 0;
3215 : }
3216 : else
3217 : break;
3218 94 : }
3219 : while (1);
3220 :
3221 12908 : if (first_loop_vinfo->epilogue_vinfo)
3222 : {
3223 6969 : poly_uint64 lowest_th
3224 6969 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3225 6969 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3226 7058 : do
3227 : {
3228 7058 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3229 7058 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3230 : || maybe_ne (lowest_th, 0U));
3231 : /* Keep track of the known smallest versioning threshold. */
3232 7058 : if (ordered_p (lowest_th, th))
3233 7058 : lowest_th = ordered_min (lowest_th, th);
3234 7058 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3235 : }
3236 7058 : while (epilog_vinfo);
3237 6969 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3238 6969 : if (dump_enabled_p ())
3239 1441 : dump_printf_loc (MSG_NOTE, vect_location,
3240 : "***** Choosing epilogue vector mode %s\n",
3241 1441 : GET_MODE_NAME
3242 : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3243 : }
3244 :
3245 12908 : return first_loop_vinfo;
3246 714797 : }
3247 :
3248 : /* Return true if there is an in-order reduction function for CODE, storing
3249 : it in *REDUC_FN if so. */
3250 :
3251 : static bool
3252 4714 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3253 : {
3254 : /* We support MINUS_EXPR by negating the operand. This also preserves an
3255 : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3256 : (-0.0) = -0.0. */
3257 4714 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3258 : {
3259 4038 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3260 0 : return true;
3261 : }
3262 : return false;
3263 : }
3264 :
3265 : /* Function reduction_fn_for_scalar_code
3266 :
3267 : Input:
3268 : CODE - tree_code of a reduction operations.
3269 :
3270 : Output:
3271 : REDUC_FN - the corresponding internal function to be used to reduce the
3272 : vector of partial results into a single scalar result, or IFN_LAST
3273 : if the operation is a supported reduction operation, but does not have
3274 : such an internal function.
3275 :
3276 : Return FALSE if CODE currently cannot be vectorized as reduction. */
3277 :
3278 : bool
3279 1984595 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3280 : {
3281 1984595 : if (code.is_tree_code ())
3282 1984537 : switch (tree_code (code))
3283 : {
3284 15107 : case MAX_EXPR:
3285 15107 : *reduc_fn = IFN_REDUC_MAX;
3286 15107 : return true;
3287 :
3288 50737 : case MIN_EXPR:
3289 50737 : *reduc_fn = IFN_REDUC_MIN;
3290 50737 : return true;
3291 :
3292 1081038 : case PLUS_EXPR:
3293 1081038 : *reduc_fn = IFN_REDUC_PLUS;
3294 1081038 : return true;
3295 :
3296 235553 : case BIT_AND_EXPR:
3297 235553 : *reduc_fn = IFN_REDUC_AND;
3298 235553 : return true;
3299 :
3300 280020 : case BIT_IOR_EXPR:
3301 280020 : *reduc_fn = IFN_REDUC_IOR;
3302 280020 : return true;
3303 :
3304 42895 : case BIT_XOR_EXPR:
3305 42895 : *reduc_fn = IFN_REDUC_XOR;
3306 42895 : return true;
3307 :
3308 279187 : case MULT_EXPR:
3309 279187 : case MINUS_EXPR:
3310 279187 : *reduc_fn = IFN_LAST;
3311 279187 : return true;
3312 :
3313 : default:
3314 : return false;
3315 : }
3316 : else
3317 58 : switch (combined_fn (code))
3318 : {
3319 34 : CASE_CFN_FMAX:
3320 34 : *reduc_fn = IFN_REDUC_FMAX;
3321 34 : return true;
3322 :
3323 24 : CASE_CFN_FMIN:
3324 24 : *reduc_fn = IFN_REDUC_FMIN;
3325 24 : return true;
3326 :
3327 : default:
3328 : return false;
3329 : }
3330 : }
3331 :
3332 : /* Set *SBOOL_FN to the corresponding function working on vector masks
3333 : for REDUC_FN. Return true if that exists, false otherwise. */
3334 :
3335 : static bool
3336 0 : sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
3337 : {
3338 0 : switch (reduc_fn)
3339 : {
3340 0 : case IFN_REDUC_AND:
3341 0 : *sbool_fn = IFN_REDUC_SBOOL_AND;
3342 0 : return true;
3343 0 : case IFN_REDUC_IOR:
3344 0 : *sbool_fn = IFN_REDUC_SBOOL_IOR;
3345 0 : return true;
3346 0 : case IFN_REDUC_XOR:
3347 0 : *sbool_fn = IFN_REDUC_SBOOL_XOR;
3348 0 : return true;
3349 : default:
3350 : return false;
3351 : }
3352 : }
3353 :
3354 : /* If there is a neutral value X such that a reduction would not be affected
3355 : by the introduction of additional X elements, return that X, otherwise
3356 : return null. CODE is the code of the reduction and SCALAR_TYPE is type
3357 : of the scalar elements. If the reduction has just a single initial value
3358 : then INITIAL_VALUE is that value, otherwise it is null.
3359 : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3360 : In that case no signed zero is returned. */
3361 :
3362 : tree
3363 52244 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3364 : tree initial_value, bool as_initial)
3365 : {
3366 52244 : if (code.is_tree_code ())
3367 52186 : switch (tree_code (code))
3368 : {
3369 7792 : case DOT_PROD_EXPR:
3370 7792 : case SAD_EXPR:
3371 7792 : case MINUS_EXPR:
3372 7792 : case BIT_IOR_EXPR:
3373 7792 : case BIT_XOR_EXPR:
3374 7792 : return build_zero_cst (scalar_type);
3375 39415 : case WIDEN_SUM_EXPR:
3376 39415 : case PLUS_EXPR:
3377 39415 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3378 92 : return build_real (scalar_type, dconstm0);
3379 : else
3380 39323 : return build_zero_cst (scalar_type);
3381 :
3382 2046 : case MULT_EXPR:
3383 2046 : return build_one_cst (scalar_type);
3384 :
3385 934 : case BIT_AND_EXPR:
3386 934 : return build_all_ones_cst (scalar_type);
3387 :
3388 : case MAX_EXPR:
3389 : case MIN_EXPR:
3390 : return initial_value;
3391 :
3392 384 : default:
3393 384 : return NULL_TREE;
3394 : }
3395 : else
3396 58 : switch (combined_fn (code))
3397 : {
3398 : CASE_CFN_FMIN:
3399 : CASE_CFN_FMAX:
3400 : return initial_value;
3401 :
3402 0 : default:
3403 0 : return NULL_TREE;
3404 : }
3405 : }
3406 :
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
   STMT is printed with a message MSG, using dump kind MSG_TYPE and the
   current vectorizer location.  */

static void
report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
{
  dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
}
3415 :
/* Return true if we need an in-order reduction for operation CODE
   on type TYPE.  */
3419 :
3420 : bool
3421 6441562 : needs_fold_left_reduction_p (tree type, code_helper code)
3422 : {
3423 : /* CHECKME: check for !flag_finite_math_only too? */
3424 6441562 : if (SCALAR_FLOAT_TYPE_P (type))
3425 : {
3426 547769 : if (code.is_tree_code ())
3427 547715 : switch (tree_code (code))
3428 : {
3429 : case MIN_EXPR:
3430 : case MAX_EXPR:
3431 : return false;
3432 :
3433 546016 : default:
3434 546016 : return !flag_associative_math;
3435 : }
3436 : else
3437 54 : switch (combined_fn (code))
3438 : {
3439 : CASE_CFN_FMIN:
3440 : CASE_CFN_FMAX:
3441 : return false;
3442 :
3443 2 : default:
3444 2 : return !flag_associative_math;
3445 : }
3446 : }
3447 :
3448 5893793 : if (INTEGRAL_TYPE_P (type))
3449 5892927 : return (!code.is_tree_code ()
3450 5892927 : || !operation_no_trapping_overflow (type, tree_code (code)));
3451 :
3452 866 : if (SAT_FIXED_POINT_TYPE_P (type))
3453 : return true;
3454 :
3455 : return false;
3456 : }
3457 :
/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
   has a handled computation expression.  Store the main reduction
   operation in *CODE.  */
3461 :
static bool
check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
		      tree loop_arg, code_helper *code,
		      vec<std::pair<ssa_op_iter, use_operand_p> > &path,
		      bool inner_loop_of_double_reduc)
{
  /* Phase 1: do a depth-first walk from the latch value LOOP_ARG back to
     the PHI result, recording the visited use chain in PATH.  Each pushed
     pair remembers the SSA operand iterator state so the walk can backtrack
     (the "pop" loop below) when it runs into a dead end.  */
  auto_bitmap visited;
  tree lookfor = PHI_RESULT (phi);
  ssa_op_iter curri;
  use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
  /* Position the iterator on the latch argument of PHI.  */
  while (USE_FROM_PTR (curr) != loop_arg)
    curr = op_iter_next_use (&curri);
  /* Exhaust the PHI iterator so backtracking never resumes it.  */
  curri.i = curri.numops;
  do
    {
      path.safe_push (std::make_pair (curri, curr));
      tree use = USE_FROM_PTR (curr);
      if (use == lookfor)
	break;
      gimple *def = SSA_NAME_DEF_STMT (use);
      if (gimple_nop_p (def)
	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
	{
	  /* Dead end: the definition is outside the loop.  Backtrack to the
	     most recent frame that still has an unvisited SSA operand.  */
pop:
	  do
	    {
	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
	      curri = x.first;
	      curr = x.second;
	      do
		curr = op_iter_next_use (&curri);
	      /* Skip already visited or non-SSA operands (from iterating
	         over PHI args).  */
	      while (curr != NULL_USE_OPERAND_P
		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
			 || ! bitmap_set_bit (visited,
					      SSA_NAME_VERSION
						(USE_FROM_PTR (curr)))));
	    }
	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
	  /* PATH exhausted without reaching the PHI result: no cycle.  */
	  if (curr == NULL_USE_OPERAND_P)
	    break;
	}
      else
	{
	  /* Descend into DEF's operands, skipping visited and non-SSA
	     ones as above.  */
	  if (gimple_code (def) == GIMPLE_PHI)
	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
	  else
	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
	  while (curr != NULL_USE_OPERAND_P
		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
		     || ! bitmap_set_bit (visited,
					  SSA_NAME_VERSION
					    (USE_FROM_PTR (curr)))))
	    curr = op_iter_next_use (&curri);
	  if (curr == NULL_USE_OPERAND_P)
	    goto pop;
	}
    }
  while (1);
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
      unsigned i;
      std::pair<ssa_op_iter, use_operand_p> *x;
      FOR_EACH_VEC_ELT (path, i, x)
	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
      dump_printf (MSG_NOTE, "\n");
    }

  /* Phase 2: check whether the reduction path detected is valid, i.e. all
     statements on it perform one and the same (sign-compatible) operation
     and each intermediate value has exactly one in-cycle use.  */
  bool fail = path.length () == 0;
  /* Whether the running reduction value is negated each iteration
     (e.g. x = c - x); such chains are rejected at the end.  */
  bool neg = false;
  int sign = -1;
  *code = ERROR_MARK;
  for (unsigned i = 1; i < path.length (); ++i)
    {
      gimple *use_stmt = USE_STMT (path[i].second);
      gimple_match_op op;
      if (!gimple_extract_op (use_stmt, &op))
	{
	  fail = true;
	  break;
	}
      /* OPI becomes the operand index at which the previous reduction
	 value enters USE_STMT; op.num_ops means "not found".  */
      unsigned int opi = op.num_ops;
      if (gassign *assign = dyn_cast<gassign *> (use_stmt))
	{
	  /* The following make sure we can compute the operand index
	     easily plus it mostly disallows chaining via COND_EXPR condition
	     operands.  */
	  for (opi = 0; opi < op.num_ops; ++opi)
	    if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
	      break;
	}
      else if (gcall *call = dyn_cast<gcall *> (use_stmt))
	{
	  for (opi = 0; opi < op.num_ops; ++opi)
	    if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
	      break;
	}
      if (opi == op.num_ops)
	{
	  fail = true;
	  break;
	}
      op.code = canonicalize_code (op.code, op.type);
      if (op.code == MINUS_EXPR)
	{
	  /* Treat subtraction as addition so mixed +/- chains match.  */
	  op.code = PLUS_EXPR;
	  /* Track whether we negate the reduction value each iteration.  */
	  if (op.ops[1] == op.ops[opi])
	    neg = ! neg;
	}
      else if (op.code == IFN_COND_SUB)
	{
	  op.code = IFN_COND_ADD;
	  /* Track whether we negate the reduction value each iteration.  */
	  if (op.ops[2] == op.ops[opi])
	    neg = ! neg;
	}
      /* For an FMA the reduction code is the PLUS if the addition chain
	 is the reduction.  */
      else if (op.code == IFN_FMA && opi == 2)
	op.code = PLUS_EXPR;
      if (CONVERT_EXPR_CODE_P (op.code)
	  && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
	/* No-op conversions on the path are transparent.  */
	;
      else if (*code == ERROR_MARK)
	{
	  /* First real operation on the path determines the reduction
	     code and the expected signedness.  */
	  *code = op.code;
	  sign = TYPE_SIGN (op.type);
	}
      else if (op.code != *code)
	{
	  fail = true;
	  break;
	}
      else if ((op.code == MIN_EXPR
		|| op.code == MAX_EXPR)
	       && sign != TYPE_SIGN (op.type))
	{
	  /* MIN/MAX results differ between signed and unsigned types,
	     so a sign change along the path invalidates the chain.  */
	  fail = true;
	  break;
	}
      /* Check there's only a single stmt the op is used on.  For the
	 not value-changing tail and the last stmt allow out-of-loop uses,
	 but not when this is the inner loop of a double reduction.
	 ???  We could relax this and handle arbitrary live stmts by
	 forcing a scalar epilogue for example.  */
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *op_use_stmt;
      unsigned cnt = 0;
      bool cond_fn_p = op.code.is_internal_fn ()
	&& (conditional_internal_fn_code (internal_fn (op.code))
	    != ERROR_MARK);

      FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
	{
	  /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
	     have op1 twice (once as definition, once as else) in the same
	     operation.  Enforce this.  */
	  if (cond_fn_p && op_use_stmt == use_stmt)
	    {
	      gcall *call = as_a<gcall *> (use_stmt);
	      unsigned else_pos
		= internal_fn_else_index (internal_fn (op.code));
	      if (gimple_call_arg (call, else_pos) != op.ops[opi])
		{
		  fail = true;
		  break;
		}
	      for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
		{
		  if (j == else_pos)
		    continue;
		  if (gimple_call_arg (call, j) == op.ops[opi])
		    cnt++;
		}
	    }
	  else if (!is_gimple_debug (op_use_stmt)
		   && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
		       || flow_bb_inside_loop_p (loop,
						 gimple_bb (op_use_stmt))))
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      cnt++;
	}

      if (cnt != 1)
	{
	  fail = true;
	  break;
	}
    }
  return ! fail && ! neg && *code != ERROR_MARK;
}
3658 :
3659 : bool
3660 21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3661 : tree loop_arg, enum tree_code code)
3662 : {
3663 21 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3664 21 : code_helper code_;
3665 21 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
3666 21 : && code_ == code);
3667 21 : }
3668 :
3669 :
3670 :
3671 : /* Function vect_is_simple_reduction
3672 :
3673 : (1) Detect a cross-iteration def-use cycle that represents a simple
3674 : reduction computation. We look for the following pattern:
3675 :
3676 : loop_header:
3677 : a1 = phi < a0, a2 >
3678 : a3 = ...
3679 : a2 = operation (a3, a1)
3680 :
3681 : or
3682 :
3683 : a3 = ...
3684 : loop_header:
3685 : a1 = phi < a0, a2 >
3686 : a2 = operation (a3, a1)
3687 :
3688 : such that:
3689 : 1. operation is commutative and associative and it is safe to
3690 : change the order of the computation
3691 : 2. no uses for a2 in the loop (a2 is used out of the loop)
3692 : 3. no uses of a1 in the loop besides the reduction operation
3693 : 4. no uses of a1 outside the loop.
3694 :
3695 : Conditions 1,4 are tested here.
3696 : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3697 :
3698 : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3699 : nested cycles.
3700 :
3701 : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3702 : reductions:
3703 :
3704 : a1 = phi < a0, a2 >
3705 : inner loop (def of a3)
3706 : a2 = phi < a3 >
3707 :
3708 : (4) Detect condition expressions, ie:
3709 : for (int i = 0; i < N; i++)
3710 : if (a[i] < val)
3711 : ret_val = a[i];
3712 :
3713 : */
3714 :
/* Classify the cycle rooted at PHI_INFO's PHI node.  Returns the
   stmt_vec_info of the latch definition when the cycle is a handled
   reduction, nested cycle or double reduction, NULL otherwise.  On a
   detected double reduction *DOUBLE_REDUC is set to the inner-loop PHI;
   DOUBLE_REDUC == NULL signals we are analyzing the inner loop of a
   double reduction.  See the comment above for the recognized shapes.  */

static stmt_vec_info
vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
			  gphi **double_reduc)
{
  gphi *phi = as_a <gphi *> (phi_info->stmt);
  gimple *phi_use_stmt = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  /* When double_reduc is NULL we are testing the inner loop of a
     double reduction.  */
  bool inner_loop_of_double_reduc = double_reduc == NULL;
  if (double_reduc)
    *double_reduc = NULL;
  STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;

  tree phi_name = PHI_RESULT (phi);
  /* ???  If there are no uses of the PHI result the inner loop reduction
     won't be detected as possibly double-reduction by vectorizable_reduction
     because that tries to walk the PHI arg from the preheader edge which
     can be constant.  See PR60382.  */
  if (has_zero_uses (phi_name))
    return NULL;
  class loop *loop = (gimple_bb (phi))->loop_father;

  /* Count the distinct in-loop uses of the PHI result; any out-of-loop
     use of the intermediate value disqualifies the cycle.  */
  unsigned nphi_def_loop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;

      if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "intermediate value used outside loop.\n");

	  return NULL;
	}

      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
	 op1 twice (once as definition, once as else) in the same operation.
	 Only count it as one.  */
      if (use_stmt != phi_use_stmt)
	{
	  nphi_def_loop_uses++;
	  phi_use_stmt = use_stmt;
	}
    }

  /* The value flowing in from the latch must be an SSA name defined
     inside the loop.  */
  tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
  if (TREE_CODE (latch_def) != SSA_NAME)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction: not ssa_name: %T\n", latch_def);
      return NULL;
    }

  stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
  if (!def_stmt_info
      || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
    return NULL;

  bool nested_in_vect_loop
    = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);

  /* Count in-loop uses of the latch value and collect its loop-closed
     PHIs (out-of-loop uses).  */
  unsigned nlatch_def_loop_uses = 0;
  auto_vec<gphi *, 3> lcphis;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	nlatch_def_loop_uses++;
      else
	/* We can have more than one loop-closed PHI.  */
	lcphis.safe_push (as_a <gphi *> (use_stmt));
    }

  /* If we are vectorizing an inner reduction we are executing that
     in the original order only in case we are not dealing with a
     double reduction.  */
  if (nested_in_vect_loop && !inner_loop_of_double_reduc)
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_NOTE, def_stmt_info->stmt,
			"detected nested cycle: ");
      return def_stmt_info;
    }

  /* When the inner loop of a double reduction ends up with more than
     one loop-closed PHI we have failed to classify alternate such
     PHIs as double reduction, leading to wrong code.  See PR103237.  */
  if (inner_loop_of_double_reduc && lcphis.length () != 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "unhandle double reduction\n");
      return NULL;
    }

  /* If this isn't a nested cycle or if the nested cycle reduction value
     is used outside of the inner loop we cannot handle uses of the reduction
     value.  */
  if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction used in loop.\n");
      return NULL;
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
    {
      tree op1 = PHI_ARG_DEF (def_stmt, 0);
      if (gimple_phi_num_args (def_stmt) != 1
	  || TREE_CODE (op1) != SSA_NAME)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported phi node definition.\n");

	  return NULL;
	}

      /* Verify there is an inner cycle composed of the PHI phi_use_stmt
	 and the latch definition op1.  */
      gimple *def1 = SSA_NAME_DEF_STMT (op1);
      if (gimple_bb (def1)
	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
	  && loop->inner
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
	  && (is_gimple_assign (def1) || is_gimple_call (def1))
	  && is_a <gphi *> (phi_use_stmt)
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
	  && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
					    loop_latch_edge (loop->inner)))
	  && lcphis.length () == 1)
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_NOTE, def_stmt,
			    "detected double reduction: ");

	  *double_reduc = as_a <gphi *> (phi_use_stmt);
	  return def_stmt_info;
	}

      return NULL;
    }

  /* Look for the expression computing latch_def from the loop PHI result.  */
  auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
  code_helper code;
  if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
			    path, inner_loop_of_double_reduc))
    {
      STMT_VINFO_REDUC_CODE (phi_info) = code;
      if (code == COND_EXPR && !nested_in_vect_loop)
	STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;

      /* Fill in STMT_VINFO_REDUC_IDX: for each statement on the reduction
	 path record at which operand position the reduction value flows
	 in, walking the path from the PHI towards the latch value.  */
      unsigned i;
      for (i = path.length () - 1; i >= 1; --i)
	{
	  gimple *stmt = USE_STMT (path[i].second);
	  stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
	  gimple_match_op op;
	  if (!gimple_extract_op (stmt, &op))
	    gcc_unreachable ();
	  if (gassign *assign = dyn_cast<gassign *> (stmt))
	    STMT_VINFO_REDUC_IDX (stmt_info)
	      = path[i].second->use - gimple_assign_rhs1_ptr (assign);
	  else
	    {
	      gcall *call = as_a<gcall *> (stmt);
	      STMT_VINFO_REDUC_IDX (stmt_info)
		= path[i].second->use - gimple_call_arg_ptr (call, 0);
	    }
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "reduction: detected reduction\n");

      return def_stmt_info;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "reduction: unknown pattern\n");

  return NULL;
}
3910 :
3911 : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3912 : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3913 : or -1 if not known. */
3914 :
3915 : static int
3916 362796 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3917 : {
3918 362796 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3919 362796 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3920 : {
3921 157436 : if (dump_enabled_p ())
3922 2912 : dump_printf_loc (MSG_NOTE, vect_location,
3923 : "cost model: epilogue peel iters set to vf/2 "
3924 : "because loop iterations are unknown .\n");
3925 157436 : return assumed_vf / 2;
3926 : }
3927 : else
3928 : {
3929 205360 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3930 205360 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
3931 205360 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3932 : /* If we need to peel for gaps, but no peeling is required, we have to
3933 : peel VF iterations. */
3934 205360 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3935 205360 : peel_iters_epilogue = assumed_vf;
3936 205360 : return peel_iters_epilogue;
3937 : }
3938 : }
3939 :
/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.
   Sets *PEEL_ITERS_EPILOGUE to the computed number of epilogue
   iterations, appends the per-copy statement costs from
   SCALAR_COST_VEC scaled by the respective peel counts to
   PROLOGUE_COST_VEC and EPILOGUE_COST_VEC, and returns the total
   recorded cost.  */
int
vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
			     int *peel_iters_epilogue,
			     stmt_vector_for_cost *scalar_cost_vec,
			     stmt_vector_for_cost *prologue_cost_vec,
			     stmt_vector_for_cost *epilogue_cost_vec)
{
  int retval = 0;

  *peel_iters_epilogue
    = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      /* If peeled iterations are known but number of scalar loop
	 iterations are unknown, count a taken branch per peeled loop.  */
      if (peel_iters_prologue > 0)
	retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
				   vect_prologue);
      if (*peel_iters_epilogue > 0)
	retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
				    vect_epilogue);
    }

  /* Each peeled iteration re-executes every scalar statement, so scale
     each recorded scalar cost by the number of peeled iterations.  */
  stmt_info_for_cost *si;
  int j;
  if (peel_iters_prologue)
    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
      retval += record_stmt_cost (prologue_cost_vec,
				  si->count * peel_iters_prologue,
				  si->kind, si->stmt_info, si->misalign,
				  vect_prologue);
  if (*peel_iters_epilogue)
    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
      retval += record_stmt_cost (epilogue_cost_vec,
				  si->count * *peel_iters_epilogue,
				  si->kind, si->stmt_info, si->misalign,
				  vect_epilogue);

  return retval;
}
3982 :
3983 : /* Function vect_estimate_min_profitable_iters
3984 :
3985 : Return the number of iterations required for the vector version of the
3986 : loop to be profitable relative to the cost of the scalar version of the
3987 : loop.
3988 :
3989 : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3990 : of iterations for vectorization. -1 value means loop vectorization
3991 : is not profitable. This returned value may be used for dynamic
3992 : profitability check.
3993 :
3994 : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3995 : for static check against estimated number of iterations. */
3996 :
3997 : static void
3998 99589 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3999 : int *ret_min_profitable_niters,
4000 : int *ret_min_profitable_estimate,
4001 : unsigned *suggested_unroll_factor)
4002 : {
4003 99589 : int min_profitable_iters;
4004 99589 : int min_profitable_estimate;
4005 99589 : int peel_iters_prologue;
4006 99589 : int peel_iters_epilogue;
4007 99589 : unsigned vec_inside_cost = 0;
4008 99589 : int vec_outside_cost = 0;
4009 99589 : unsigned vec_prologue_cost = 0;
4010 99589 : unsigned vec_epilogue_cost = 0;
4011 99589 : int scalar_single_iter_cost = 0;
4012 99589 : int scalar_outside_cost = 0;
4013 99589 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4014 99589 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4015 99589 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
4016 :
4017 : /* Cost model disabled. */
4018 99589 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4019 : {
4020 16864 : if (dump_enabled_p ())
4021 10591 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4022 16864 : *ret_min_profitable_niters = 0;
4023 16864 : *ret_min_profitable_estimate = 0;
4024 16864 : return;
4025 : }
4026 :
4027 : /* Requires loop versioning tests to handle misalignment. */
4028 82725 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4029 : {
4030 : /* FIXME: Make cost depend on complexity of individual check. */
4031 13 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4032 13 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4033 13 : if (dump_enabled_p ())
4034 1 : dump_printf (MSG_NOTE,
4035 : "cost model: Adding cost of checks for loop "
4036 : "versioning to treat misalignment.\n");
4037 : }
4038 :
4039 : /* Requires loop versioning with alias checks. */
4040 82725 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4041 : {
4042 : /* FIXME: Make cost depend on complexity of individual check. */
4043 4117 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4044 4117 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4045 4117 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4046 2 : if (len)
4047 : /* Count LEN - 1 ANDs and LEN comparisons. */
4048 2 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4049 : scalar_stmt, vect_prologue);
4050 4117 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4051 1108 : if (len)
4052 : {
4053 : /* Count LEN - 1 ANDs and LEN comparisons. */
4054 1108 : unsigned int nstmts = len * 2 - 1;
4055 : /* +1 for each bias that needs adding. */
4056 2216 : for (unsigned int i = 0; i < len; ++i)
4057 1108 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4058 125 : nstmts += 1;
4059 1108 : (void) add_stmt_cost (target_cost_data, nstmts,
4060 : scalar_stmt, vect_prologue);
4061 : }
4062 4117 : if (dump_enabled_p ())
4063 18 : dump_printf (MSG_NOTE,
4064 : "cost model: Adding cost of checks for loop "
4065 : "versioning aliasing.\n");
4066 : }
4067 :
4068 : /* Requires loop versioning with niter checks. */
4069 82725 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4070 : {
4071 : /* FIXME: Make cost depend on complexity of individual check. */
4072 665 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4073 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4074 665 : if (dump_enabled_p ())
4075 1 : dump_printf (MSG_NOTE,
4076 : "cost model: Adding cost of checks for loop "
4077 : "versioning niters.\n");
4078 : }
4079 :
4080 82725 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4081 4791 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4082 : vect_prologue);
4083 :
4084 : /* Count statements in scalar loop. Using this as scalar cost for a single
4085 : iteration for now.
4086 :
4087 : TODO: Add outer loop support.
4088 :
4089 : TODO: Consider assigning different costs to different scalar
4090 : statements. */
4091 :
4092 82725 : scalar_single_iter_cost = (loop_vinfo->scalar_costs->total_cost ()
4093 82725 : * param_vect_scalar_cost_multiplier) / 100;
4094 :
4095 : /* Add additional cost for the peeled instructions in prologue and epilogue
4096 : loop. (For fully-masked loops there will be no peeling.)
4097 :
4098 : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4099 : at compile-time - we assume it's vf/2 (the worst would be vf-1).
4100 :
4101 : TODO: Build an expression that represents peel_iters for prologue and
4102 : epilogue to be used in a run-time test. */
4103 :
4104 82725 : bool prologue_need_br_taken_cost = false;
4105 82725 : bool prologue_need_br_not_taken_cost = false;
4106 :
4107 : /* Calculate peel_iters_prologue. */
4108 82725 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4109 : peel_iters_prologue = 0;
4110 82725 : else if (npeel < 0)
4111 : {
4112 280 : peel_iters_prologue = assumed_vf / 2;
4113 280 : if (dump_enabled_p ())
4114 6 : dump_printf (MSG_NOTE, "cost model: "
4115 : "prologue peel iters set to vf/2.\n");
4116 :
4117 : /* If peeled iterations are unknown, count a taken branch and a not taken
4118 : branch per peeled loop. Even if scalar loop iterations are known,
4119 : vector iterations are not known since peeled prologue iterations are
4120 : not known. Hence guards remain the same. */
4121 : prologue_need_br_taken_cost = true;
4122 : prologue_need_br_not_taken_cost = true;
4123 : }
4124 : else
4125 : {
4126 82445 : peel_iters_prologue = npeel;
4127 82445 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4128 : /* If peeled iterations are known but number of scalar loop
4129 : iterations are unknown, count a taken branch per peeled loop. */
4130 82725 : prologue_need_br_taken_cost = true;
4131 : }
4132 :
4133 82725 : bool epilogue_need_br_taken_cost = false;
4134 82725 : bool epilogue_need_br_not_taken_cost = false;
4135 :
4136 : /* Calculate peel_iters_epilogue. */
4137 82725 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4138 : /* We need to peel exactly one iteration for gaps. */
4139 22 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4140 82703 : else if (npeel < 0)
4141 : {
4142 : /* If peeling for alignment is unknown, loop bound of main loop
4143 : becomes unknown. */
4144 280 : peel_iters_epilogue = assumed_vf / 2;
4145 280 : if (dump_enabled_p ())
4146 6 : dump_printf (MSG_NOTE, "cost model: "
4147 : "epilogue peel iters set to vf/2 because "
4148 : "peeling for alignment is unknown.\n");
4149 :
4150 : /* See the same reason above in peel_iters_prologue calculation. */
4151 : epilogue_need_br_taken_cost = true;
4152 : epilogue_need_br_not_taken_cost = true;
4153 : }
4154 : else
4155 : {
4156 82423 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4157 82423 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4158 : /* If peeled iterations are known but number of scalar loop
4159 : iterations are unknown, count a taken branch per peeled loop. */
4160 82725 : epilogue_need_br_taken_cost = true;
4161 : }
4162 :
4163 82725 : stmt_info_for_cost *si;
4164 82725 : int j;
4165 : /* Add costs associated with peel_iters_prologue. */
4166 82725 : if (peel_iters_prologue)
4167 1028 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4168 : {
4169 739 : (void) add_stmt_cost (target_cost_data,
4170 739 : si->count * peel_iters_prologue, si->kind,
4171 : si->stmt_info, si->node, si->vectype,
4172 : si->misalign, vect_prologue);
4173 : }
4174 :
4175 : /* Add costs associated with peel_iters_epilogue. */
4176 82725 : if (peel_iters_epilogue)
4177 283425 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4178 : {
4179 224978 : (void) add_stmt_cost (target_cost_data,
4180 224978 : si->count * peel_iters_epilogue, si->kind,
4181 : si->stmt_info, si->node, si->vectype,
4182 : si->misalign, vect_epilogue);
4183 : }
4184 :
4185 : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4186 :
4187 82725 : if (prologue_need_br_taken_cost)
4188 280 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4189 : vect_prologue);
4190 :
4191 82725 : if (prologue_need_br_not_taken_cost)
4192 280 : (void) add_stmt_cost (target_cost_data, 1,
4193 : cond_branch_not_taken, vect_prologue);
4194 :
4195 82725 : if (epilogue_need_br_taken_cost)
4196 49008 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4197 : vect_epilogue);
4198 :
4199 82725 : if (epilogue_need_br_not_taken_cost)
4200 280 : (void) add_stmt_cost (target_cost_data, 1,
4201 : cond_branch_not_taken, vect_epilogue);
4202 :
4203 : /* Take care of special costs for rgroup controls of partial vectors. */
4204 22 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4205 82747 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4206 : == vect_partial_vectors_avx512))
4207 : {
4208 : /* Calculate how many masks we need to generate. */
4209 22 : unsigned int num_masks = 0;
4210 22 : bool need_saturation = false;
4211 90 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4212 24 : if (rgm.type)
4213 : {
4214 22 : unsigned nvectors = rgm.factor;
4215 22 : num_masks += nvectors;
4216 22 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4217 22 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4218 7 : need_saturation = true;
4219 : }
4220 :
4221 : /* ??? The target isn't able to identify the costs below as
4222 : producing masks so it cannot penaltize cases where we'd run
4223 : out of mask registers for example. */
4224 :
4225 : /* ??? We are also failing to account for smaller vector masks
4226 : we generate by splitting larger masks in vect_get_loop_mask. */
4227 :
4228 : /* In the worst case, we need to generate each mask in the prologue
4229 : and in the loop body. We need one splat per group and one
4230 : compare per mask.
4231 :
4232 : Sometimes the prologue mask will fold to a constant,
4233 : so the actual prologue cost might be smaller. However, it's
4234 : simpler and safer to use the worst-case cost; if this ends up
4235 : being the tie-breaker between vectorizing or not, then it's
4236 : probably better not to vectorize. */
4237 22 : (void) add_stmt_cost (target_cost_data,
4238 : num_masks
4239 22 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4240 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4241 : vect_prologue);
4242 44 : (void) add_stmt_cost (target_cost_data,
4243 : num_masks
4244 44 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4245 : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4246 :
4247 : /* When we need saturation we need it both in the prologue and
4248 : the epilogue. */
4249 22 : if (need_saturation)
4250 : {
4251 7 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4252 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4253 7 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4254 : NULL, NULL, NULL_TREE, 0, vect_body);
4255 : }
4256 : }
4257 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4258 82703 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4259 : == vect_partial_vectors_while_ult))
4260 : {
4261 : /* Calculate how many masks we need to generate. */
4262 : unsigned int num_masks = 0;
4263 : rgroup_controls *rgm;
4264 : unsigned int num_vectors_m1;
4265 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4266 : num_vectors_m1, rgm)
4267 0 : if (rgm->type)
4268 0 : num_masks += num_vectors_m1 + 1;
4269 0 : gcc_assert (num_masks > 0);
4270 :
4271 : /* In the worst case, we need to generate each mask in the prologue
4272 : and in the loop body. One of the loop body mask instructions
4273 : replaces the comparison in the scalar loop, and since we don't
4274 : count the scalar comparison against the scalar body, we shouldn't
4275 : count that vector instruction against the vector body either.
4276 :
4277 : Sometimes we can use unpacks instead of generating prologue
4278 : masks and sometimes the prologue mask will fold to a constant,
4279 : so the actual prologue cost might be smaller. However, it's
4280 : simpler and safer to use the worst-case cost; if this ends up
4281 : being the tie-breaker between vectorizing or not, then it's
4282 : probably better not to vectorize. */
4283 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4284 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4285 : vect_prologue);
4286 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4287 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4288 : vect_body);
4289 : }
4290 82703 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4291 : {
4292 : /* Referring to the functions vect_set_loop_condition_partial_vectors
4293 : and vect_set_loop_controls_directly, we need to generate each
4294 : length in the prologue and in the loop body if required. Although
4295 : there are some possible optimizations, we consider the worst case
4296 : here. */
4297 :
4298 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4299 0 : signed char partial_load_store_bias
4300 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4301 0 : bool need_iterate_p
4302 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4303 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4304 :
4305 : /* Calculate how many statements to be added. */
4306 0 : unsigned int prologue_stmts = 0;
4307 0 : unsigned int body_stmts = 0;
4308 :
4309 0 : rgroup_controls *rgc;
4310 0 : unsigned int num_vectors_m1;
4311 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4312 0 : if (rgc->type)
4313 : {
4314 : /* May need one SHIFT for nitems_total computation. */
4315 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4316 0 : if (nitems != 1 && !niters_known_p)
4317 0 : prologue_stmts += 1;
4318 :
4319 : /* May need one MAX and one MINUS for wrap around. */
4320 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4321 0 : prologue_stmts += 2;
4322 :
4323 : /* Need one MAX and one MINUS for each batch limit excepting for
4324 : the 1st one. */
4325 0 : prologue_stmts += num_vectors_m1 * 2;
4326 :
4327 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4328 :
4329 : /* Need to set up lengths in prologue, only one MIN required
4330 : for each since start index is zero. */
4331 0 : prologue_stmts += num_vectors;
4332 :
4333 : /* If we have a non-zero partial load bias, we need one PLUS
4334 : to adjust the load length. */
4335 0 : if (partial_load_store_bias != 0)
4336 0 : body_stmts += 1;
4337 :
4338 0 : unsigned int length_update_cost = 0;
4339 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4340 : /* For decrement IV style, Each only need a single SELECT_VL
4341 : or MIN since beginning to calculate the number of elements
4342 : need to be processed in current iteration. */
4343 : length_update_cost = 1;
4344 : else
4345 : /* For increment IV stype, Each may need two MINs and one MINUS to
4346 : update lengths in body for next iteration. */
4347 0 : length_update_cost = 3;
4348 :
4349 0 : if (need_iterate_p)
4350 0 : body_stmts += length_update_cost * num_vectors;
4351 : }
4352 :
4353 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4354 : scalar_stmt, vect_prologue);
4355 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4356 : scalar_stmt, vect_body);
4357 : }
4358 :
4359 : /* FORNOW: The scalar outside cost is incremented in one of the
4360 : following ways:
4361 :
4362 : 1. The vectorizer checks for alignment and aliasing and generates
4363 : a condition that allows dynamic vectorization. A cost model
4364 : check is ANDED with the versioning condition. Hence scalar code
4365 : path now has the added cost of the versioning check.
4366 :
4367 : if (cost > th & versioning_check)
4368 : jmp to vector code
4369 :
4370 : Hence run-time scalar is incremented by not-taken branch cost.
4371 :
4372 : 2. The vectorizer then checks if a prologue is required. If the
4373 : cost model check was not done before during versioning, it has to
4374 : be done before the prologue check.
4375 :
4376 : if (cost <= th)
4377 : prologue = scalar_iters
4378 : if (prologue == 0)
4379 : jmp to vector code
4380 : else
4381 : execute prologue
4382 : if (prologue == num_iters)
4383 : go to exit
4384 :
4385 : Hence the run-time scalar cost is incremented by a taken branch,
4386 : plus a not-taken branch, plus a taken branch cost.
4387 :
4388 : 3. The vectorizer then checks if an epilogue is required. If the
4389 : cost model check was not done before during prologue check, it
4390 : has to be done with the epilogue check.
4391 :
4392 : if (prologue == 0)
4393 : jmp to vector code
4394 : else
4395 : execute prologue
4396 : if (prologue == num_iters)
4397 : go to exit
4398 : vector code:
4399 : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4400 : jmp to epilogue
4401 :
4402 : Hence the run-time scalar cost should be incremented by 2 taken
4403 : branches.
4404 :
4405 : TODO: The back end may reorder the BBS's differently and reverse
4406 : conditions/branch directions. Change the estimates below to
4407 : something more reasonable. */
4408 :
4409 : /* If the number of iterations is known and we do not do versioning, we can
4410 : decide whether to vectorize at compile time. Hence the scalar version
4411 : do not carry cost model guard costs. */
4412 33160 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4413 115885 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4414 : {
4415 : /* Cost model check occurs at versioning. */
4416 50177 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4417 4791 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4418 : else
4419 : {
4420 : /* Cost model check occurs at prologue generation. */
4421 45386 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4422 150 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4423 150 : + vect_get_stmt_cost (cond_branch_not_taken);
4424 : /* Cost model check occurs at epilogue generation. */
4425 : else
4426 45236 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4427 : }
4428 : }
4429 :
4430 : /* Complete the target-specific cost calculations. */
4431 82725 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
4432 82725 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
4433 82725 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
4434 82725 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
4435 82725 : if (suggested_unroll_factor)
4436 82538 : *suggested_unroll_factor
4437 82538 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
4438 :
4439 82538 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
4440 233 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4441 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4442 : *suggested_unroll_factor,
4443 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4444 : {
4445 0 : if (dump_enabled_p ())
4446 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4447 : "can't unroll as unrolled vectorization factor larger"
4448 : " than maximum vectorization factor: "
4449 : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4450 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4451 0 : *suggested_unroll_factor = 1;
4452 : }
4453 :
4454 82725 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4455 :
4456 82725 : if (dump_enabled_p ())
4457 : {
4458 609 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4459 609 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4460 : vec_inside_cost);
4461 609 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4462 : vec_prologue_cost);
4463 609 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4464 : vec_epilogue_cost);
4465 609 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4466 : scalar_single_iter_cost);
4467 609 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4468 : scalar_outside_cost);
4469 609 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4470 : vec_outside_cost);
4471 609 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4472 : peel_iters_prologue);
4473 609 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4474 : peel_iters_epilogue);
4475 : }
4476 :
4477 : /* Calculate number of iterations required to make the vector version
4478 : profitable, relative to the loop bodies only. The following condition
4479 : must hold true:
4480 : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4481 : where
4482 : SIC = scalar iteration cost, VIC = vector iteration cost,
4483 : VOC = vector outside cost, VF = vectorization factor,
4484 : NPEEL = prologue iterations + epilogue iterations,
4485 : SOC = scalar outside cost for run time cost model check. */
4486 :
4487 82725 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4488 82725 : - vec_inside_cost);
4489 82725 : if (saving_per_viter <= 0)
4490 : {
4491 24161 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4492 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4493 : "vectorization did not happen for a simd loop");
4494 :
4495 24161 : if (dump_enabled_p ())
4496 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4497 : "cost model: the vector iteration cost = %d "
4498 : "divided by the scalar iteration cost = %d "
4499 : "is greater or equal to the vectorization factor = %d"
4500 : ".\n",
4501 : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4502 24161 : *ret_min_profitable_niters = -1;
4503 24161 : *ret_min_profitable_estimate = -1;
4504 24161 : return;
4505 : }
4506 :
4507 : /* ??? The "if" arm is written to handle all cases; see below for what
4508 : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4509 58564 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4510 : {
4511 : /* Rewriting the condition above in terms of the number of
4512 : vector iterations (vniters) rather than the number of
4513 : scalar iterations (niters) gives:
4514 :
4515 : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4516 :
4517 : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4518 :
4519 : For integer N, X and Y when X > 0:
4520 :
4521 : N * X > Y <==> N >= (Y /[floor] X) + 1. */
4522 14 : int outside_overhead = (vec_outside_cost
4523 14 : - scalar_single_iter_cost * peel_iters_prologue
4524 14 : - scalar_single_iter_cost * peel_iters_epilogue
4525 : - scalar_outside_cost);
4526 : /* We're only interested in cases that require at least one
4527 : vector iteration. */
4528 14 : int min_vec_niters = 1;
4529 14 : if (outside_overhead > 0)
4530 11 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4531 :
4532 14 : if (dump_enabled_p ())
4533 6 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4534 : min_vec_niters);
4535 :
4536 14 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4537 : {
4538 : /* Now that we know the minimum number of vector iterations,
4539 : find the minimum niters for which the scalar cost is larger:
4540 :
4541 : SIC * niters > VIC * vniters + VOC - SOC
4542 :
4543 : We know that the minimum niters is no more than
4544 : vniters * VF + NPEEL, but it might be (and often is) less
4545 : than that if a partial vector iteration is cheaper than the
4546 : equivalent scalar code. */
4547 14 : int threshold = (vec_inside_cost * min_vec_niters
4548 14 : + vec_outside_cost
4549 14 : - scalar_outside_cost);
4550 14 : if (threshold <= 0)
4551 : min_profitable_iters = 1;
4552 : else
4553 14 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4554 : }
4555 : else
4556 : /* Convert the number of vector iterations into a number of
4557 : scalar iterations. */
4558 0 : min_profitable_iters = (min_vec_niters * assumed_vf
4559 0 : + peel_iters_prologue
4560 : + peel_iters_epilogue);
4561 : }
4562 : else
4563 : {
4564 58550 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4565 58550 : * assumed_vf
4566 58550 : - vec_inside_cost * peel_iters_prologue
4567 58550 : - vec_inside_cost * peel_iters_epilogue);
4568 58550 : if (min_profitable_iters <= 0)
4569 : min_profitable_iters = 0;
4570 : else
4571 : {
4572 49260 : min_profitable_iters /= saving_per_viter;
4573 :
4574 49260 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4575 49260 : <= (((int) vec_inside_cost * min_profitable_iters)
4576 49260 : + (((int) vec_outside_cost - scalar_outside_cost)
4577 : * assumed_vf)))
4578 49260 : min_profitable_iters++;
4579 : }
4580 : }
4581 :
4582 58564 : if (dump_enabled_p ())
4583 587 : dump_printf (MSG_NOTE,
4584 : " Calculated minimum iters for profitability: %d\n",
4585 : min_profitable_iters);
4586 :
4587 58564 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4588 58550 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4589 : /* We want the vectorized loop to execute at least once. */
4590 : min_profitable_iters = assumed_vf + peel_iters_prologue;
4591 10776 : else if (min_profitable_iters < peel_iters_prologue)
4592 : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4593 : vectorized loop executes at least once. */
4594 : min_profitable_iters = peel_iters_prologue;
4595 :
4596 58564 : if (dump_enabled_p ())
4597 587 : dump_printf_loc (MSG_NOTE, vect_location,
4598 : " Runtime profitability threshold = %d\n",
4599 : min_profitable_iters);
4600 :
4601 58564 : *ret_min_profitable_niters = min_profitable_iters;
4602 :
4603 : /* Calculate number of iterations required to make the vector version
4604 : profitable, relative to the loop bodies only.
4605 :
4606 : Non-vectorized variant is SIC * niters and it must win over vector
4607 : variant on the expected loop trip count. The following condition must hold true:
4608 : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4609 :
4610 58564 : if (vec_outside_cost <= 0)
4611 : min_profitable_estimate = 0;
4612 : /* ??? This "else if" arm is written to handle all cases; see below for
4613 : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4614 53053 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4615 : {
4616 : /* This is a repeat of the code above, but with + SOC rather
4617 : than - SOC. */
4618 14 : int outside_overhead = (vec_outside_cost
4619 14 : - scalar_single_iter_cost * peel_iters_prologue
4620 14 : - scalar_single_iter_cost * peel_iters_epilogue
4621 : + scalar_outside_cost);
4622 14 : int min_vec_niters = 1;
4623 14 : if (outside_overhead > 0)
4624 14 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4625 :
4626 14 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4627 : {
4628 14 : int threshold = (vec_inside_cost * min_vec_niters
4629 14 : + vec_outside_cost
4630 14 : + scalar_outside_cost);
4631 14 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4632 : }
4633 : else
4634 : min_profitable_estimate = (min_vec_niters * assumed_vf
4635 : + peel_iters_prologue
4636 : + peel_iters_epilogue);
4637 : }
4638 : else
4639 : {
4640 53039 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4641 53039 : * assumed_vf
4642 53039 : - vec_inside_cost * peel_iters_prologue
4643 53039 : - vec_inside_cost * peel_iters_epilogue)
4644 53039 : / ((scalar_single_iter_cost * assumed_vf)
4645 : - vec_inside_cost);
4646 : }
4647 58564 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4648 58564 : if (dump_enabled_p ())
4649 587 : dump_printf_loc (MSG_NOTE, vect_location,
4650 : " Static estimate profitability threshold = %d\n",
4651 : min_profitable_estimate);
4652 :
4653 58564 : *ret_min_profitable_estimate = min_profitable_estimate;
4654 : }
4655 :
4656 : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4657 : vector elements (not bits) for a vector with NELT elements. */
4658 : static void
4659 2187 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4660 : vec_perm_builder *sel)
4661 : {
4662 : /* The encoding is a single stepped pattern. Any wrap-around is handled
4663 : by vec_perm_indices. */
4664 2187 : sel->new_vector (nelt, 1, 3);
4665 8748 : for (unsigned int i = 0; i < 3; i++)
4666 6561 : sel->quick_push (i + offset);
4667 2187 : }
4668 :
4669 : /* Checks whether the target supports whole-vector shifts for vectors of mode
4670 : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4671 : it supports vec_perm_const with masks for all necessary shift amounts. */
4672 : static bool
4673 7683 : have_whole_vector_shift (machine_mode mode)
4674 : {
4675 7683 : if (can_implement_p (vec_shr_optab, mode))
4676 : return true;
4677 :
4678 : /* Variable-length vectors should be handled via the optab. */
4679 61 : unsigned int nelt;
4680 122 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4681 : return false;
4682 :
4683 61 : vec_perm_builder sel;
4684 61 : vec_perm_indices indices;
4685 307 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4686 : {
4687 246 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
4688 246 : indices.new_vector (sel, 2, nelt);
4689 246 : if (!can_vec_perm_const_p (mode, mode, indices, false))
4690 : return false;
4691 : }
4692 : return true;
4693 61 : }
4694 :
4695 : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4696 : multiplication operands have differing signs and (b) we intend
4697 : to emulate the operation using a series of signed DOT_PROD_EXPRs.
4698 : See vect_emulate_mixed_dot_prod for the actual sequence used. */
4699 :
4700 : static bool
4701 2186 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
4702 : {
4703 2186 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
4704 2186 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4705 1733 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4706 : return false;
4707 :
4708 578 : tree rhs1 = gimple_assign_rhs1 (assign);
4709 578 : tree rhs2 = gimple_assign_rhs2 (assign);
4710 578 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4711 : return false;
4712 :
4713 429 : return !directly_supported_p (DOT_PROD_EXPR,
4714 : SLP_TREE_VECTYPE (slp_node),
4715 143 : SLP_TREE_VECTYPE
4716 : (SLP_TREE_CHILDREN (slp_node)[0]),
4717 143 : optab_vector_mixed_sign);
4718 : }
4719 :
4720 : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4721 : functions. Design better to avoid maintenance issues. */
4722 :
4723 : /* Function vect_model_reduction_cost.
4724 :
4725 : Models cost for a reduction operation, including the vector ops
4726 : generated within the strip-mine loop in some cases, the initial
4727 : definition before the loop, and the epilogue code that must be generated. */
4728 :
4729 : static void
4730 46923 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
4731 : slp_tree node, internal_fn reduc_fn,
4732 : vect_reduction_type reduction_type,
4733 : int ncopies, stmt_vector_for_cost *cost_vec)
4734 : {
4735 46923 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4736 46923 : tree vectype;
4737 46923 : machine_mode mode;
4738 46923 : class loop *loop = NULL;
4739 :
4740 46923 : if (loop_vinfo)
4741 46923 : loop = LOOP_VINFO_LOOP (loop_vinfo);
4742 :
4743 : /* Condition reductions generate two reductions in the loop. */
4744 46923 : if (reduction_type == COND_REDUCTION)
4745 280 : ncopies *= 2;
4746 :
4747 46923 : vectype = SLP_TREE_VECTYPE (node);
4748 46923 : mode = TYPE_MODE (vectype);
4749 46923 : stmt_vec_info orig_stmt_info
4750 46923 : = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
4751 :
4752 46923 : gimple_match_op op;
4753 46923 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4754 0 : gcc_unreachable ();
4755 :
4756 46923 : if (reduction_type == EXTRACT_LAST_REDUCTION)
4757 : /* No extra instructions are needed in the prologue. The loop body
4758 : operations are costed in vectorizable_condition. */
4759 : inside_cost = 0;
4760 46923 : else if (reduction_type == FOLD_LEFT_REDUCTION)
4761 : {
4762 : /* No extra instructions needed in the prologue. */
4763 3927 : prologue_cost = 0;
4764 :
4765 3927 : if (reduc_fn != IFN_LAST)
4766 : /* Count one reduction-like operation per vector. */
4767 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4768 : node, 0, vect_body);
4769 : else
4770 : {
4771 : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4772 3927 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4773 3927 : inside_cost = record_stmt_cost (cost_vec, nelements,
4774 : vec_to_scalar, node, 0,
4775 : vect_body);
4776 3927 : inside_cost += record_stmt_cost (cost_vec, nelements,
4777 : scalar_stmt, node, 0,
4778 : vect_body);
4779 : }
4780 : }
4781 : else
4782 : {
4783 : /* Add in the cost of the initial definitions. */
4784 42996 : int prologue_stmts;
4785 42996 : if (reduction_type == COND_REDUCTION)
4786 : /* For cond reductions we have four vectors: initial index, step,
4787 : initial result of the data reduction, initial value of the index
4788 : reduction. */
4789 : prologue_stmts = 4;
4790 : else
4791 : /* We need the initial reduction value. */
4792 42716 : prologue_stmts = 1;
4793 42996 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4794 : scalar_to_vec, node, 0,
4795 : vect_prologue);
4796 : }
4797 :
4798 : /* Determine cost of epilogue code.
4799 :
4800 : We have a reduction operator that will reduce the vector in one statement.
4801 : Also requires scalar extract. */
4802 :
4803 46923 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4804 : {
4805 46764 : if (reduc_fn != IFN_LAST)
4806 : {
4807 35345 : if (reduction_type == COND_REDUCTION)
4808 : {
4809 : /* An EQ stmt and an COND_EXPR stmt. */
4810 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4811 : vector_stmt, node, 0,
4812 : vect_epilogue);
4813 : /* Reduction of the max index and a reduction of the found
4814 : values. */
4815 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4816 : vec_to_scalar, node, 0,
4817 : vect_epilogue);
4818 : /* A broadcast of the max value. */
4819 8 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4820 : scalar_to_vec, node, 0,
4821 : vect_epilogue);
4822 : }
4823 : else
4824 : {
4825 35337 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4826 : node, 0, vect_epilogue);
4827 35337 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4828 : vec_to_scalar, node, 0,
4829 : vect_epilogue);
4830 : }
4831 : }
4832 11419 : else if (reduction_type == COND_REDUCTION)
4833 : {
4834 272 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4835 : /* Extraction of scalar elements. */
4836 544 : epilogue_cost += record_stmt_cost (cost_vec,
4837 272 : 2 * estimated_nunits,
4838 : vec_to_scalar, node, 0,
4839 : vect_epilogue);
4840 : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4841 272 : epilogue_cost += record_stmt_cost (cost_vec,
4842 272 : 2 * estimated_nunits - 3,
4843 : scalar_stmt, node, 0,
4844 : vect_epilogue);
4845 : }
4846 11147 : else if (reduction_type == EXTRACT_LAST_REDUCTION
4847 11147 : || reduction_type == FOLD_LEFT_REDUCTION)
4848 : /* No extra instructions needed in the epilogue. */
4849 : ;
4850 : else
4851 : {
4852 7220 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4853 7220 : tree bitsize = TYPE_SIZE (op.type);
4854 7220 : int element_bitsize = tree_to_uhwi (bitsize);
4855 7220 : int nelements = vec_size_in_bits / element_bitsize;
4856 :
4857 7220 : if (op.code == COND_EXPR)
4858 28 : op.code = MAX_EXPR;
4859 :
4860 : /* We have a whole vector shift available. */
4861 968 : if (VECTOR_MODE_P (mode)
4862 7220 : && directly_supported_p (op.code, vectype)
4863 13031 : && have_whole_vector_shift (mode))
4864 : {
4865 : /* Final reduction via vector shifts and the reduction operator.
4866 : Also requires scalar extract. */
4867 17433 : epilogue_cost += record_stmt_cost (cost_vec,
4868 11622 : exact_log2 (nelements) * 2,
4869 : vector_stmt, node, 0,
4870 : vect_epilogue);
4871 5811 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4872 : vec_to_scalar, node, 0,
4873 : vect_epilogue);
4874 : }
4875 : else
4876 : /* Use extracts and reduction op for final reduction. For N
4877 : elements, we have N extracts and N-1 reduction ops. */
4878 1409 : epilogue_cost += record_stmt_cost (cost_vec,
4879 1409 : nelements + nelements - 1,
4880 : vector_stmt, node, 0,
4881 : vect_epilogue);
4882 : }
4883 : }
4884 :
4885 46923 : if (dump_enabled_p ())
4886 2846 : dump_printf (MSG_NOTE,
4887 : "vect_model_reduction_cost: inside_cost = %d, "
4888 : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4889 : prologue_cost, epilogue_cost);
4890 46923 : }
4891 :
4892 : /* SEQ is a sequence of instructions that initialize the reduction
4893 : described by REDUC_INFO. Emit them in the appropriate place. */
4894 :
4895 : static void
4896 445 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4897 : vect_reduc_info reduc_info, gimple *seq)
4898 : {
4899 445 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
4900 : {
4901 : /* When reusing an accumulator from the main loop, we only need
4902 : initialization instructions if the main loop can be skipped.
4903 : In that case, emit the initialization instructions at the end
4904 : of the guard block that does the skip. */
4905 25 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
 : /* A reused accumulator implies the epilogue can be entered via a
 : skip edge around the main loop, so the edge must exist. */
4906 25 : gcc_assert (skip_edge);
4907 25 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4908 25 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4909 : }
4910 : else
4911 : {
4912 : /* The normal case: emit the initialization instructions on the
4913 : preheader edge. */
4914 420 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4915 420 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4916 : }
4917 445 : }
4918 :
4919 : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4920 : which performs a reduction involving GROUP_SIZE scalar statements.
4921 : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4922 : is nonnull, introducing extra elements of that value will not change the
4923 : result. */
4924 :
4925 : static void
4926 21855 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4927 : vect_reduc_info reduc_info,
4928 : tree vector_type,
4929 : vec<tree> *vec_oprnds,
4930 : unsigned int number_of_vectors,
4931 : unsigned int group_size, tree neutral_op)
4932 : {
4933 21855 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
4934 21855 : unsigned HOST_WIDE_INT nunits;
4935 21855 : unsigned j, number_of_places_left_in_vector;
4936 21855 : unsigned int i;
4937 :
 : /* Without a neutral value to pad with, every lane of the group must
 : have its own initial value. */
4938 43710 : gcc_assert (group_size == initial_values.length () || neutral_op);
4939 :
4940 : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4941 : created vectors. It is greater than 1 if unrolling is performed.
4942 :
4943 : For example, we have two scalar operands, s1 and s2 (e.g., group of
4944 : strided accesses of size two), while NUNITS is four (i.e., four scalars
4945 : of this type can be packed in a vector). The output vector will contain
4946 : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4947 : will be 2).
4948 :
4949 : If GROUP_SIZE > NUNITS, the scalars will be split into several
4950 : vectors containing the operands.
4951 :
4952 : For example, NUNITS is four as before, and the group size is 8
4953 : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4954 : {s5, s6, s7, s8}. */
4955 :
4956 21855 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4957 : nunits = group_size;
4958 :
4959 21855 : tree vector_elt_type = TREE_TYPE (vector_type);
4960 21855 : number_of_places_left_in_vector = nunits;
4961 21855 : bool constant_p = true;
4962 21855 : tree_vector_builder elts (vector_type, nunits, 1);
4963 21855 : elts.quick_grow (nunits);
4964 21855 : gimple_seq ctor_seq = NULL;
 : /* If NEUTRAL_OP does not already have the element type, convert it.
 : For boolean vectors the element value is materialized explicitly
 : as NEUTRAL_OP ? -1 : 0 via a COND_EXPR. */
4965 21855 : if (neutral_op
4966 43139 : && !useless_type_conversion_p (vector_elt_type,
4967 21284 : TREE_TYPE (neutral_op)))
4968 : {
4969 220 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4970 199 : neutral_op = gimple_build (&ctor_seq, COND_EXPR,
4971 : vector_elt_type,
4972 : neutral_op,
4973 : build_all_ones_cst (vector_elt_type),
4974 : build_zero_cst (vector_elt_type));
4975 : else
4976 21 : neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
4977 : }
4978 210511 : for (j = 0; j < nunits * number_of_vectors; ++j)
4979 : {
4980 188656 : tree op;
4981 188656 : i = j % group_size;
4982 :
4983 : /* Get the def before the loop. In reduction chain we have only
4984 : one initial value. Else we have as many as PHIs in the group. */
4985 188656 : if (i >= initial_values.length () || (j > i && neutral_op))
4986 : op = neutral_op;
4987 : else
4988 : {
 : /* As for NEUTRAL_OP above, convert the initial value to the
 : element type if necessary, caching the converted value. */
4989 51304 : if (!useless_type_conversion_p (vector_elt_type,
4990 25652 : TREE_TYPE (initial_values[i])))
4991 : {
4992 235 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4993 422 : initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
4994 : vector_elt_type,
4995 211 : initial_values[i],
4996 : build_all_ones_cst
4997 : (vector_elt_type),
4998 : build_zero_cst
4999 : (vector_elt_type));
5000 : else
5001 48 : initial_values[i] = gimple_convert (&ctor_seq,
5002 : vector_elt_type,
5003 24 : initial_values[i]);
5004 : }
5005 25652 : op = initial_values[i];
5006 : }
5007 :
5008 : /* Create 'vect_ = {op0,op1,...,opn}'. */
5009 188656 : number_of_places_left_in_vector--;
5010 188656 : elts[nunits - number_of_places_left_in_vector - 1] = op;
5011 188656 : if (!CONSTANT_CLASS_P (op))
5012 2360 : constant_p = false;
5013 :
 : /* Once a full vector's worth of elements has been collected,
 : emit the vector definition. */
5014 188656 : if (number_of_places_left_in_vector == 0)
5015 : {
5016 23317 : tree init;
5017 46634 : if (constant_p && !neutral_op
5018 46346 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5019 23317 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5020 : /* Build the vector directly from ELTS. */
5021 23317 : init = gimple_build_vector (&ctor_seq, &elts)
5022 0 : else if (neutral_op)
5023 : {
5024 : /* Build a vector of the neutral value and shift the
5025 : other elements into place. */
5026 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5027 : neutral_op);
5028 0 : int k = nunits;
5029 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5030 : k -= 1;
5031 0 : while (k > 0)
5032 : {
5033 0 : k -= 1;
5034 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5035 0 : vector_type, init, elts[k]);
5036 : }
5037 : }
5038 : else
5039 : {
5040 : /* First time round, duplicate ELTS to fill the
5041 : required number of vectors. */
5042 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5043 : elts, number_of_vectors, *vec_oprnds);
5044 0 : break;
5045 : }
5046 23317 : vec_oprnds->quick_push (init);
5047 :
 : /* Reset the builder to start collecting elements for the
 : next vector. */
5048 23317 : number_of_places_left_in_vector = nunits;
5049 23317 : elts.new_vector (vector_type, nunits, 1);
5050 23317 : elts.quick_grow (nunits);
5051 23317 : constant_p = true;
5052 : }
5053 : }
 : /* Emit any conversion/constructor statements built above at the
 : appropriate place (guard block or preheader edge). */
5054 21855 : if (ctor_seq != NULL)
5055 445 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5056 21855 : }
5057 :
5058 : vect_reduc_info
5059 133037 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
5060 : {
 : /* NODE belongs to a reduction cycle iff its cycle id is not -1;
 : the id indexes LOOP_VINFO's reduc_infos array. */
5061 133037 : if (node->cycle_info.id == -1)
5062 : return NULL;
5063 131127 : return loop_vinfo->reduc_infos[node->cycle_info.id];
5064 : }
5065 :
5066 : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5067 : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5068 : return false. */
5069 :
5070 : static bool
5071 21494 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5072 : vect_reduc_info reduc_info, tree vectype)
5073 : {
5074 21494 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5075 21494 : if (!main_loop_vinfo)
5076 : return false;
5077 :
 : /* Only plain tree-code reductions are handled here. */
5078 4839 : if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5079 : return false;
5080 :
5081 : /* We are not set up to handle vector bools when they are not mapped
5082 : to vector integer data types. */
5083 4824 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5084 4894 : && GET_MODE_CLASS (TYPE_MODE (vectype)) != MODE_VECTOR_INT)
5085 : return false;
5086 :
5087 4822 : unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
5088 4822 : auto_vec<tree, 16> main_loop_results (num_phis);
5089 4822 : auto_vec<tree, 16> initial_values (num_phis);
5090 4822 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5091 : {
5092 : /* The epilogue loop can be entered either from the main loop or
5093 : from an earlier guard block. */
5094 4599 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5095 18420 : for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
5096 : {
5097 : /* Look for:
5098 :
5099 : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5100 : INITIAL_VALUE(guard block)>. */
5101 4623 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5102 :
5103 4623 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5104 4623 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5105 :
5106 4623 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5107 4623 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5108 :
5109 4623 : main_loop_results.quick_push (from_main_loop);
5110 4623 : initial_values.quick_push (from_skip);
5111 : }
5112 : }
5113 : else
5114 : /* The main loop dominates the epilogue loop. */
5115 223 : main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));
5116 :
5117 : /* See if the main loop has the kind of accumulator we need. */
 : /* The accumulator must cover exactly the same scalar results, in the
 : same order, as the values flowing into this epilogue. */
5118 4822 : vect_reusable_accumulator *accumulator
5119 4822 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5120 4822 : if (!accumulator
5121 9628 : || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
5122 14446 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5123 : VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
5124 : return false;
5125 :
5126 : /* Handle the case where we can reduce wider vectors to narrower ones. */
5127 4812 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5128 4812 : unsigned HOST_WIDE_INT m;
5129 4812 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5130 4812 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5131 0 : return false;
5132 : /* Check the intermediate vector types and operations are available. */
 : /* Each step halves the element count; both the reduction operation
 : on the narrower type and extraction of that type from the wider
 : one must be supported. */
5133 4812 : tree prev_vectype = old_vectype;
5134 4812 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5135 13913 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5136 : {
5137 4811 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5138 4811 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5139 4811 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5140 4811 : if (!intermediate_vectype
5141 4811 : || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
5142 : intermediate_vectype)
5143 9102 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5144 4291 : TYPE_MODE (intermediate_vectype)))
5145 : return false;
5146 : prev_vectype = intermediate_vectype;
5147 : }
5148 :
5149 : /* Non-SLP reductions might apply an adjustment after the reduction
5150 : operation, in order to simplify the initialization of the accumulator.
5151 : If the epilogue loop carries on from where the main loop left off,
5152 : it should apply the same adjustment to the final reduction result.
5153 :
5154 : If the epilogue loop can also be entered directly (rather than via
5155 : the main loop), we need to be able to handle that case in the same way,
5156 : with the same adjustment. (In principle we could add a PHI node
5157 : to select the correct adjustment, but in practice that shouldn't be
5158 : necessary.) */
5159 4290 : tree main_adjustment
5160 4290 : = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5161 4290 : if (loop_vinfo->main_loop_edge && main_adjustment)
5162 : {
5163 3631 : gcc_assert (num_phis == 1);
5164 3631 : tree initial_value = initial_values[0];
5165 : /* Check that we can use INITIAL_VALUE as the adjustment and
5166 : initialize the accumulator with a neutral value instead. */
5167 3631 : if (!operand_equal_p (initial_value, main_adjustment))
5168 : return false;
5169 3525 : initial_values[0] = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
5170 : }
 : /* Commit: record the adjustment and initial values, and remember the
 : accumulator so the epilogue continues from the main loop's state. */
5171 4184 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5172 4184 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
5173 4184 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
5174 4184 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
5175 4184 : return true;
5176 4822 : }
5177 :
5178 : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5179 : CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5180 :
5181 : static tree
5182 4228 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5183 : gimple_seq *seq)
5184 : {
5185 4228 : gcc_assert (!VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (vec_def))
5186 : || (GET_MODE_CLASS (TYPE_MODE (TREE_TYPE (vec_def)))
5187 : == MODE_VECTOR_INT))
5188 4228 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5189 4228 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5190 4228 : tree stype = TREE_TYPE (vectype);
5191 4228 : tree new_temp = vec_def;
 : /* Repeatedly halve the vector: extract the low and high halves and
 : combine them with CODE until the element count matches VECTYPE. */
5192 8448 : while (nunits > nunits1)
5193 : {
5194 4220 : nunits /= 2;
5195 4220 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5196 4220 : stype, nunits);
5197 4220 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5198 :
5199 : /* The target has to make sure we support lowpart/highpart
5200 : extraction, either via direct vector extract or through
5201 : an integer mode punning. */
5202 4220 : tree dst1, dst2;
5203 4220 : gimple *epilog_stmt;
5204 4220 : if (convert_optab_handler (vec_extract_optab,
5205 4220 : TYPE_MODE (TREE_TYPE (new_temp)),
5206 4220 : TYPE_MODE (vectype1))
5207 : != CODE_FOR_nothing)
5208 : {
5209 : /* Extract sub-vectors directly once vec_extract becomes
5210 : a conversion optab. */
5211 2685 : dst1 = make_ssa_name (vectype1);
5212 2685 : epilog_stmt
5213 5370 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5214 : build3 (BIT_FIELD_REF, vectype1,
5215 2685 : new_temp, TYPE_SIZE (vectype1),
5216 : bitsize_int (0)));
5217 2685 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5218 2685 : dst2 = make_ssa_name (vectype1);
5219 2685 : epilog_stmt
5220 2685 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5221 : build3 (BIT_FIELD_REF, vectype1,
5222 2685 : new_temp, TYPE_SIZE (vectype1),
5223 2685 : bitsize_int (bitsize)));
5224 2685 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5225 : }
5226 : else
5227 : {
5228 : /* Extract via punning to appropriately sized integer mode
5229 : vector. */
 : /* View-convert to a two-element integer vector, BIT_FIELD_REF
 : out each integer half, then view-convert each half back to
 : the narrower vector type. */
5230 1535 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5231 1535 : tree etype = build_vector_type (eltype, 2);
5232 3070 : gcc_assert (convert_optab_handler (vec_extract_optab,
5233 : TYPE_MODE (etype),
5234 : TYPE_MODE (eltype))
5235 : != CODE_FOR_nothing);
5236 1535 : tree tem = make_ssa_name (etype);
5237 1535 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5238 : build1 (VIEW_CONVERT_EXPR,
5239 : etype, new_temp));
5240 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5241 1535 : new_temp = tem;
5242 1535 : tem = make_ssa_name (eltype);
5243 1535 : epilog_stmt
5244 3070 : = gimple_build_assign (tem, BIT_FIELD_REF,
5245 : build3 (BIT_FIELD_REF, eltype,
5246 1535 : new_temp, TYPE_SIZE (eltype),
5247 : bitsize_int (0)));
5248 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5249 1535 : dst1 = make_ssa_name (vectype1);
5250 1535 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5251 : build1 (VIEW_CONVERT_EXPR,
5252 : vectype1, tem));
5253 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5254 1535 : tem = make_ssa_name (eltype);
5255 1535 : epilog_stmt
5256 1535 : = gimple_build_assign (tem, BIT_FIELD_REF,
5257 : build3 (BIT_FIELD_REF, eltype,
5258 1535 : new_temp, TYPE_SIZE (eltype),
5259 1535 : bitsize_int (bitsize)));
5260 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5261 1535 : dst2 = make_ssa_name (vectype1);
5262 1535 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5263 : build1 (VIEW_CONVERT_EXPR,
5264 : vectype1, tem));
5265 1535 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5266 : }
5267 :
 : /* Combine the two halves with the reduction operation. */
5268 4220 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5269 : }
 : /* If the result type differs from VECTYPE (e.g. same size, different
 : representation), view-convert to the requested type. */
5270 4228 : if (!useless_type_conversion_p (vectype, TREE_TYPE (new_temp)))
5271 : {
5272 66 : tree dst3 = make_ssa_name (vectype);
5273 66 : gimple *epilog_stmt = gimple_build_assign (dst3, VIEW_CONVERT_EXPR,
5274 : build1 (VIEW_CONVERT_EXPR,
5275 : vectype, new_temp));
5276 66 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5277 66 : new_temp = dst3;
5278 : }
5279 :
5280 4228 : return new_temp;
5281 : }
5282 :
5283 : /* Function vect_create_epilog_for_reduction
5284 :
5285 : Create code at the loop-epilog to finalize the result of a reduction
5286 : computation.
5287 :
5288 : STMT_INFO is the scalar reduction stmt that is being vectorized.
5289 : SLP_NODE is an SLP node containing a group of reduction statements. The
5290 : first one in this group is STMT_INFO.
5291 : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5292 : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5293 : (counting from 0)
5294 : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5295 : exit this edge is always the main loop exit.
5296 :
5297 : This function:
5298 : 1. Completes the reduction def-use cycles.
5299 : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5300 : by calling the function specified by REDUC_FN if available, or by
5301 : other means (whole-vector shifts or a scalar loop).
5302 : The function also creates a new phi node at the loop exit to preserve
5303 : loop-closed form, as illustrated below.
5304 :
5305 : The flow at the entry to this function:
5306 :
5307 : loop:
5308 : vec_def = phi <vec_init, null> # REDUCTION_PHI
5309 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5310 : s_loop = scalar_stmt # (scalar) STMT_INFO
5311 : loop_exit:
5312 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5313 : use <s_out0>
5314 : use <s_out0>
5315 :
5316 : The above is transformed by this function into:
5317 :
5318 : loop:
5319 : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5320 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5321 : s_loop = scalar_stmt # (scalar) STMT_INFO
5322 : loop_exit:
5323 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5324 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5325 : v_out2 = reduce <v_out1>
5326 : s_out3 = extract_field <v_out2, 0>
5327 : s_out4 = adjust_result <s_out3>
5328 : use <s_out4>
5329 : use <s_out4>
5330 : */
5331 :
5332 : static void
5333 22202 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5334 : stmt_vec_info stmt_info,
5335 : slp_tree slp_node,
5336 : slp_instance slp_node_instance,
5337 : edge loop_exit)
5338 : {
5339 22202 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
5340 22202 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5341 22202 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
5342 22202 : tree vectype;
5343 22202 : machine_mode mode;
5344 22202 : basic_block exit_bb;
5345 22202 : gimple *new_phi = NULL, *phi = NULL;
5346 22202 : gimple_stmt_iterator exit_gsi;
5347 22202 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5348 22202 : gimple *epilog_stmt = NULL;
5349 22202 : gimple *exit_phi;
5350 22202 : tree def;
5351 22202 : tree orig_name, scalar_result;
5352 22202 : imm_use_iterator imm_iter;
5353 22202 : use_operand_p use_p;
5354 22202 : gimple *use_stmt;
5355 22202 : auto_vec<tree> reduc_inputs;
5356 22202 : int j, i;
5357 22202 : vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
5358 22202 : unsigned int k;
5359 : /* SLP reduction without reduction chain, e.g.,
5360 : # a1 = phi <a2, a0>
5361 : # b1 = phi <b2, b0>
5362 : a2 = operation (a1)
5363 : b2 = operation (b1) */
5364 22202 : const bool slp_reduc = !reduc_info->is_reduc_chain;
5365 22202 : tree induction_index = NULL_TREE;
5366 :
5367 22202 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5368 :
5369 22202 : bool double_reduc = false;
5370 22202 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5371 22202 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5372 : {
5373 0 : double_reduc = true;
5374 0 : gcc_assert (slp_reduc);
5375 : }
5376 :
5377 22202 : vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
5378 22202 : gcc_assert (vectype);
5379 22202 : mode = TYPE_MODE (vectype);
5380 :
5381 22202 : tree induc_val = NULL_TREE;
5382 22202 : tree adjustment_def = NULL;
5383 : /* Optimize: for induction condition reduction, if we can't use zero
5384 : for induc_val, use initial_def. */
5385 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5386 62 : induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
5387 22140 : else if (double_reduc)
5388 : ;
5389 : else
5390 22140 : adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
5391 :
5392 22202 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5393 22202 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5394 22202 : if (slp_reduc)
5395 : /* All statements produce live-out values. */
5396 43996 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5397 :
5398 22202 : unsigned vec_num
5399 22202 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5400 :
5401 : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5402 : which is updated with the current index of the loop for every match of
5403 : the original loop's cond_expr (VEC_STMT). This results in a vector
5404 : containing the last time the condition passed for that vector lane.
5405 : The first match will be a 1 to allow 0 to be used for non-matching
5406 : indexes. If there are no matches at all then the vector will be all
5407 : zeroes.
5408 :
5409 : PR92772: This algorithm is broken for architectures that support
5410 : masked vectors, but do not provide fold_extract_last. */
5411 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
5412 : {
5413 67 : gcc_assert (!double_reduc);
5414 67 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5415 67 : slp_tree cond_node = slp_node_instance->root;
5416 143 : while (cond_node != slp_node_instance->reduc_phis)
5417 : {
5418 76 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5419 76 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5420 : {
5421 76 : gimple *vec_stmt
5422 76 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5423 76 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5424 76 : ccompares.safe_push
5425 76 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5426 76 : SLP_TREE_REDUC_IDX (cond_node) == 2));
5427 : }
5428 76 : int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
5429 76 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5430 : }
5431 67 : gcc_assert (ccompares.length () != 0);
5432 :
5433 67 : tree indx_before_incr, indx_after_incr;
5434 67 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5435 67 : int scalar_precision
5436 67 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5437 67 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5438 67 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
5439 67 : (TYPE_MODE (vectype), cr_index_scalar_type,
5440 : TYPE_VECTOR_SUBPARTS (vectype));
5441 :
5442 : /* First we create a simple vector induction variable which starts
5443 : with the values {1,2,3,...} (SERIES_VECT) and increments by the
5444 : vector size (STEP). */
5445 :
5446 : /* Create a {1,2,3,...} vector. */
5447 67 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5448 :
5449 : /* Create a vector of the step value. */
5450 67 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5451 67 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5452 :
5453 : /* Create an induction variable. */
5454 67 : gimple_stmt_iterator incr_gsi;
5455 67 : bool insert_after;
5456 67 : vect_iv_increment_position (LOOP_VINFO_MAIN_EXIT (loop_vinfo),
5457 : &incr_gsi, &insert_after);
5458 67 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5459 : insert_after, &indx_before_incr, &indx_after_incr);
5460 :
5461 : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5462 : filled with zeros (VEC_ZERO). */
5463 :
5464 : /* Create a vector of 0s. */
5465 67 : tree zero = build_zero_cst (cr_index_scalar_type);
5466 67 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5467 :
5468 : /* Create a vector phi node. */
5469 67 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5470 67 : new_phi = create_phi_node (new_phi_tree, loop->header);
5471 67 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5472 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
5473 :
5474 : /* Now take the condition from the loops original cond_exprs
5475 : and produce a new cond_exprs (INDEX_COND_EXPR) which for
5476 : every match uses values from the induction variable
5477 : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5478 : (NEW_PHI_TREE).
5479 : Finally, we update the phi (NEW_PHI_TREE) to take the value of
5480 : the new cond_expr (INDEX_COND_EXPR). */
5481 67 : gimple_seq stmts = NULL;
5482 210 : for (int i = ccompares.length () - 1; i != -1; --i)
5483 : {
5484 76 : tree ccompare = ccompares[i].first;
5485 76 : if (ccompares[i].second)
5486 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5487 : cr_index_vector_type,
5488 : ccompare,
5489 : indx_before_incr, new_phi_tree);
5490 : else
5491 7 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5492 : cr_index_vector_type,
5493 : ccompare,
5494 : new_phi_tree, indx_before_incr);
5495 : }
5496 67 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5497 :
5498 : /* Update the phi with the vec cond. */
5499 67 : induction_index = new_phi_tree;
5500 67 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5501 : loop_latch_edge (loop), UNKNOWN_LOCATION);
5502 67 : }
5503 :
5504 : /* 2. Create epilog code.
5505 : The reduction epilog code operates across the elements of the vector
5506 : of partial results computed by the vectorized loop.
5507 : The reduction epilog code consists of:
5508 :
5509 : step 1: compute the scalar result in a vector (v_out2)
5510 : step 2: extract the scalar result (s_out3) from the vector (v_out2)
5511 : step 3: adjust the scalar result (s_out3) if needed.
5512 :
5513 : Step 1 can be accomplished using one of the following three schemes:
5514 : (scheme 1) using reduc_fn, if available.
5515 : (scheme 2) using whole-vector shifts, if available.
5516 : (scheme 3) using a scalar loop. In this case steps 1+2 above are
5517 : combined.
5518 :
5519 : The overall epilog code looks like this:
5520 :
5521 : s_out0 = phi <s_loop> # original EXIT_PHI
5522 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5523 : v_out2 = reduce <v_out1> # step 1
5524 : s_out3 = extract_field <v_out2, 0> # step 2
5525 : s_out4 = adjust_result <s_out3> # step 3
5526 :
5527 : (step 3 is optional, and steps 1 and 2 may be combined).
5528 : Lastly, the uses of s_out0 are replaced by s_out4. */
5529 :
5530 :
5531 : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5532 : v_out1 = phi <VECT_DEF>
5533 : Store them in NEW_PHIS. */
5534 : /* We need to reduce values in all exits. */
5535 22202 : exit_bb = loop_exit->dest;
5536 22202 : exit_gsi = gsi_after_labels (exit_bb);
5537 22202 : reduc_inputs.create (vec_num);
5538 45876 : for (unsigned i = 0; i < vec_num; i++)
5539 : {
5540 23674 : gimple_seq stmts = NULL;
5541 23674 : def = vect_get_slp_vect_def (slp_node, i);
5542 23674 : tree new_def = copy_ssa_name (def);
5543 23674 : phi = create_phi_node (new_def, exit_bb);
5544 23674 : if (LOOP_VINFO_MAIN_EXIT (loop_vinfo) == loop_exit)
5545 23647 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
5546 : else
5547 : {
5548 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
5549 30 : SET_PHI_ARG_DEF (phi, k, def);
5550 : }
5551 23674 : new_def = gimple_convert (&stmts, vectype, new_def);
5552 23674 : reduc_inputs.quick_push (new_def);
5553 23674 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5554 : }
5555 :
5556 : /* 2.2 Get the original scalar reduction variable as defined in the loop.
5557 : In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
5558 : pattern), the scalar-def is taken from the original stmt that the
5559 : pattern-stmt (STMT) replaces. */
5560 :
5561 23019 : tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5562 22202 : tree scalar_type = TREE_TYPE (scalar_dest);
5563 22202 : scalar_results.truncate (0);
5564 22202 : scalar_results.reserve_exact (group_size);
5565 22202 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5566 :
5567 : /* True if we should implement SLP_REDUC using native reduction operations
5568 : instead of scalar operations. */
5569 22202 : const bool direct_slp_reduc
5570 22202 : = (reduc_fn != IFN_LAST
5571 22202 : && slp_reduc
5572 22202 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5573 :
5574 : /* If signed overflow is undefined we might need to perform reduction
5575 : computations in an unsigned type. */
5576 22202 : tree compute_vectype = vectype;
5577 22202 : if (ANY_INTEGRAL_TYPE_P (vectype)
5578 15202 : && TYPE_OVERFLOW_UNDEFINED (vectype)
5579 5549 : && code.is_tree_code ()
5580 27751 : && arith_code_with_undefined_signed_overflow ((tree_code) code))
5581 4086 : compute_vectype = unsigned_type_for (vectype);
5582 :
5583 : /* In case of reduction chain, e.g.,
5584 : # a1 = phi <a3, a0>
5585 : a2 = operation (a1)
5586 : a3 = operation (a2),
5587 :
5588 : we may end up with more than one vector result. Here we reduce them
5589 : to one vector.
5590 :
5591 : The same is true for a SLP reduction, e.g.,
5592 : # a1 = phi <a2, a0>
5593 : # b1 = phi <b2, b0>
5594 : a2 = operation (a1)
5595 : b2 = operation (a2),
5596 :
5597 : where we can end up with more than one vector as well. We can
5598 : easily accumulate vectors when the number of vector elements is
5599 : a multiple of the SLP group size.
5600 :
5601 : The same is true if we couldn't use a single defuse cycle. */
5602 22202 : if ((!slp_reduc
5603 : || direct_slp_reduc
5604 : || (slp_reduc
5605 22202 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
5606 44404 : && reduc_inputs.length () > 1)
5607 : {
5608 542 : gimple_seq stmts = NULL;
5609 542 : tree single_input = reduc_inputs[0];
5610 542 : if (compute_vectype != vectype)
5611 157 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5612 : compute_vectype, single_input);
5613 1849 : for (k = 1; k < reduc_inputs.length (); k++)
5614 : {
5615 1307 : tree input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5616 1307 : compute_vectype, reduc_inputs[k]);
5617 1307 : single_input = gimple_build (&stmts, code, compute_vectype,
5618 : single_input, input);
5619 : }
5620 542 : if (compute_vectype != vectype)
5621 157 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5622 : vectype, single_input);
5623 542 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5624 :
5625 542 : reduc_inputs.truncate (0);
5626 542 : reduc_inputs.safe_push (single_input);
5627 : }
5628 :
5629 22202 : tree orig_reduc_input = reduc_inputs[0];
5630 :
5631 : /* If this loop is an epilogue loop that can be skipped after the
5632 : main loop, we can only share a reduction operation between the
5633 : main loop and the epilogue if we put it at the target of the
5634 : skip edge.
5635 :
5636 : We can still reuse accumulators if this check fails. Doing so has
5637 : the minor(?) benefit of making the epilogue loop's scalar result
5638 : independent of the main loop's scalar result. */
5639 22202 : bool unify_with_main_loop_p = false;
5640 22202 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
5641 4184 : && loop_vinfo->skip_this_loop_edge
5642 3944 : && single_succ_p (exit_bb)
5643 22223 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5644 : {
5645 21 : unify_with_main_loop_p = true;
5646 :
5647 21 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5648 21 : reduc_inputs[0] = make_ssa_name (vectype);
5649 21 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5650 21 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5651 : UNKNOWN_LOCATION);
5652 21 : add_phi_arg (new_phi,
5653 21 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
5654 : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5655 21 : exit_gsi = gsi_after_labels (reduc_block);
5656 : }
5657 :
5658 : /* Shouldn't be used beyond this point. */
5659 22202 : exit_bb = nullptr;
5660 :
5661 : /* If we are operating on a mask vector and do not support direct mask
5662 : reduction, work on a bool data vector instead of a mask vector. */
5663 22202 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5664 227 : && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
5665 22394 : && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
5666 : {
5667 192 : compute_vectype = vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
5668 192 : gimple_seq stmts = NULL;
5669 392 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5670 400 : reduc_inputs[i] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
5671 200 : reduc_inputs[i],
5672 : build_one_cst (vectype),
5673 : build_zero_cst (vectype));
5674 192 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5675 : }
5676 :
5677 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5678 67 : && reduc_fn != IFN_LAST)
5679 : {
5680 : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5681 : various data values where the condition matched and another vector
5682 : (INDUCTION_INDEX) containing all the indexes of those matches. We
5683 : need to extract the last matching index (which will be the index with
5684 : highest value) and use this to index into the data vector.
5685 : For the case where there were no matches, the data vector will contain
5686 : all default values and the index vector will be all zeros. */
5687 :
5688 : /* Get various versions of the type of the vector of indexes. */
5689 4 : tree index_vec_type = TREE_TYPE (induction_index);
5690 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5691 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
5692 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
5693 :
5694 : /* Get an unsigned integer version of the type of the data vector. */
5695 4 : int scalar_precision
5696 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5697 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5698 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5699 : vectype);
5700 :
5701 : /* First we need to create a vector (ZERO_VEC) of zeros and another
5702 : vector (MAX_INDEX_VEC) filled with the last matching index, which we
5703 : can create using a MAX reduction and then expanding.
5704 : In the case where the loop never made any matches, the max index will
5705 : be zero. */
5706 :
5707 : /* Vector of {0, 0, 0,...}. */
5708 4 : tree zero_vec = build_zero_cst (vectype);
5709 :
5710 : /* Find maximum value from the vector of found indexes. */
5711 4 : tree max_index = make_ssa_name (index_scalar_type);
5712 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5713 : 1, induction_index);
5714 4 : gimple_call_set_lhs (max_index_stmt, max_index);
5715 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5716 :
5717 : /* Vector of {max_index, max_index, max_index,...}. */
5718 4 : tree max_index_vec = make_ssa_name (index_vec_type);
5719 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5720 : max_index);
5721 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5722 : max_index_vec_rhs);
5723 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5724 :
5725 : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5726 : with the vector (INDUCTION_INDEX) of found indexes, choosing values
5727 : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5728 : otherwise. Only one value should match, resulting in a vector
5729 : (VEC_COND) with one data value and the rest zeros.
5730 : In the case where the loop never made any matches, every index will
5731 : match, resulting in a vector with all data values (which will all be
5732 : the default value). */
5733 :
5734 : /* Compare the max index vector to the vector of found indexes to find
5735 : the position of the max value. */
5736 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
5737 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5738 : induction_index,
5739 : max_index_vec);
5740 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5741 :
5742 : /* Use the compare to choose either values from the data vector or
5743 : zero. */
5744 4 : tree vec_cond = make_ssa_name (vectype);
5745 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5746 : vec_compare,
5747 4 : reduc_inputs[0],
5748 : zero_vec);
5749 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5750 :
5751 : /* Finally we need to extract the data value from the vector (VEC_COND)
5752 : into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5753 : reduction, but because this doesn't exist, we can use a MAX reduction
5754 : instead. The data value might be signed or a float so we need to cast
5755 : it first.
5756 : In the case where the loop never made any matches, the data values are
5757 : all identical, and so will reduce down correctly. */
5758 :
5759 : /* Make the matched data values unsigned. */
5760 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5761 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5762 : vec_cond);
5763 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5764 : VIEW_CONVERT_EXPR,
5765 : vec_cond_cast_rhs);
5766 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5767 :
5768 : /* Reduce down to a scalar value. */
5769 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
5770 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5771 : 1, vec_cond_cast);
5772 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5773 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5774 :
5775 : /* Convert the reduced value back to the result type and set as the
5776 : result. */
5777 4 : gimple_seq stmts = NULL;
5778 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5779 : data_reduc);
5780 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5781 4 : scalar_results.safe_push (new_temp);
5782 4 : }
5783 22198 : else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5784 63 : && reduc_fn == IFN_LAST)
5785 : {
5786 : /* Condition reduction without supported IFN_REDUC_MAX. Generate
5787 : idx = 0;
5788 : idx_val = induction_index[0];
5789 : val = data_reduc[0];
5790 : for (idx = 0, val = init, i = 0; i < nelts; ++i)
5791 : if (induction_index[i] > idx_val)
5792 : val = data_reduc[i], idx_val = induction_index[i];
5793 : return val; */
5794 :
5795 63 : tree data_eltype = TREE_TYPE (vectype);
5796 63 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5797 63 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5798 63 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5799 : /* Enforced by vectorizable_reduction, which ensures we have target
5800 : support before allowing a conditional reduction on variable-length
5801 : vectors. */
5802 63 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5803 63 : tree idx_val = NULL_TREE, val = NULL_TREE;
5804 419 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5805 : {
5806 356 : tree old_idx_val = idx_val;
5807 356 : tree old_val = val;
5808 356 : idx_val = make_ssa_name (idx_eltype);
5809 356 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5810 : build3 (BIT_FIELD_REF, idx_eltype,
5811 : induction_index,
5812 356 : bitsize_int (el_size),
5813 356 : bitsize_int (off)));
5814 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5815 356 : val = make_ssa_name (data_eltype);
5816 712 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5817 : build3 (BIT_FIELD_REF,
5818 : data_eltype,
5819 356 : reduc_inputs[0],
5820 356 : bitsize_int (el_size),
5821 356 : bitsize_int (off)));
5822 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5823 356 : if (off != 0)
5824 : {
5825 293 : tree new_idx_val = idx_val;
5826 293 : if (off != v_size - el_size)
5827 : {
5828 230 : new_idx_val = make_ssa_name (idx_eltype);
5829 230 : epilog_stmt = gimple_build_assign (new_idx_val,
5830 : MAX_EXPR, idx_val,
5831 : old_idx_val);
5832 230 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5833 : }
5834 293 : tree cond = make_ssa_name (boolean_type_node);
5835 293 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5836 : idx_val, old_idx_val);
5837 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5838 293 : tree new_val = make_ssa_name (data_eltype);
5839 293 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5840 : cond, val, old_val);
5841 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5842 293 : idx_val = new_idx_val;
5843 293 : val = new_val;
5844 : }
5845 : }
5846 : /* Convert the reduced value back to the result type and set as the
5847 : result. */
5848 63 : gimple_seq stmts = NULL;
5849 63 : val = gimple_convert (&stmts, scalar_type, val);
5850 63 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5851 63 : scalar_results.safe_push (val);
5852 63 : }
5853 :
5854 : /* 2.3 Create the reduction code, using one of the three schemes described
5855 : above. In SLP we simply need to extract all the elements from the
5856 : vector (without reducing them), so we use scalar shifts. */
5857 22135 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
5858 : {
5859 20263 : tree tmp;
5860 20263 : tree vec_elem_type;
5861 :
5862 : /* Case 1: Create:
5863 : v_out2 = reduc_expr <v_out1> */
5864 :
5865 20263 : if (dump_enabled_p ())
5866 1514 : dump_printf_loc (MSG_NOTE, vect_location,
5867 : "Reduce using direct vector reduction.\n");
5868 :
5869 20263 : gimple_seq stmts = NULL;
5870 20263 : vec_elem_type = TREE_TYPE (vectype);
5871 20263 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5872 20263 : vec_elem_type, reduc_inputs[0]);
5873 20263 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5874 20263 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5875 :
5876 20263 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5877 62 : && induc_val)
5878 : {
 5879                 :	  /* Earlier we set the initial value to be a vector of induc_val
5880 : values. Check the result and if it is induc_val then replace
5881 : with the original initial value, unless induc_val is
5882 : the same as initial_def already. */
5883 60 : tree zcompare = make_ssa_name (boolean_type_node);
5884 60 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5885 : new_temp, induc_val);
5886 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5887 60 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
5888 60 : tmp = make_ssa_name (new_scalar_dest);
5889 60 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5890 : initial_def, new_temp);
5891 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5892 60 : new_temp = tmp;
5893 : }
5894 :
5895 20263 : scalar_results.safe_push (new_temp);
5896 20263 : }
5897 1681 : else if (direct_slp_reduc)
5898 : {
5899 : /* Here we create one vector for each of the GROUP_SIZE results,
5900 : with the elements for other SLP statements replaced with the
5901 : neutral value. We can then do a normal reduction on each vector. */
5902 :
5903 : /* Enforced by vectorizable_reduction. */
5904 : gcc_assert (reduc_inputs.length () == 1);
5905 : gcc_assert (pow2p_hwi (group_size));
5906 :
5907 : gimple_seq seq = NULL;
5908 :
5909 : /* Build a vector {0, 1, 2, ...}, with the same number of elements
5910 : and the same element size as VECTYPE. */
5911 : tree index = build_index_vector (vectype, 0, 1);
5912 : tree index_type = TREE_TYPE (index);
5913 : tree index_elt_type = TREE_TYPE (index_type);
5914 : tree mask_type = truth_type_for (index_type);
5915 :
5916 : /* Create a vector that, for each element, identifies which of
5917 : the results should use it. */
5918 : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5919 : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5920 : build_vector_from_val (index_type, index_mask));
5921 :
5922 : /* Get a neutral vector value. This is simply a splat of the neutral
5923 : scalar value if we have one, otherwise the initial scalar value
5924 : is itself a neutral value. */
5925 : tree vector_identity = NULL_TREE;
5926 : tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5927 : NULL_TREE, false);
5928 : if (neutral_op)
5929 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5930 : neutral_op);
5931 : for (unsigned int i = 0; i < group_size; ++i)
5932 : {
 5933                 :	  /* If there's no universal neutral value, we can use the
5934 : initial scalar value from the original PHI. This is used
5935 : for MIN and MAX reduction, for example. */
5936 : if (!neutral_op)
5937 : {
5938 : tree scalar_value
5939 : = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
5940 : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5941 : scalar_value);
5942 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5943 : scalar_value);
5944 : }
5945 :
5946 : /* Calculate the equivalent of:
5947 :
5948 : sel[j] = (index[j] == i);
5949 :
5950 : which selects the elements of REDUC_INPUTS[0] that should
5951 : be included in the result. */
5952 : tree compare_val = build_int_cst (index_elt_type, i);
5953 : compare_val = build_vector_from_val (index_type, compare_val);
5954 : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5955 : index, compare_val);
5956 :
5957 : /* Calculate the equivalent of:
5958 :
5959 : vec = seq ? reduc_inputs[0] : vector_identity;
5960 :
5961 : VEC is now suitable for a full vector reduction. */
5962 : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5963 : sel, reduc_inputs[0], vector_identity);
5964 :
5965 : /* Do the reduction and convert it to the appropriate type. */
5966 : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5967 : TREE_TYPE (vectype), vec);
5968 : scalar = gimple_convert (&seq, scalar_type, scalar);
5969 : scalar_results.safe_push (scalar);
5970 : }
5971 : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5972 : }
5973 : else
5974 : {
5975 1681 : bool reduce_with_shift;
5976 1681 : tree vec_temp;
5977 :
5978 1681 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5979 :
5980 : /* See if the target wants to do the final (shift) reduction
5981 : in a vector mode of smaller size and first reduce upper/lower
5982 : halves against each other. */
5983 1872 : enum machine_mode mode1 = mode;
5984 1872 : tree stype = TREE_TYPE (vectype);
5985 1872 : if (compute_vectype != vectype)
5986 : {
5987 482 : stype = unsigned_type_for (stype);
5988 482 : gimple_seq stmts = NULL;
5989 1034 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5990 : {
5991 552 : tree new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5992 552 : compute_vectype, reduc_inputs[i]);
5993 552 : reduc_inputs[i] = new_temp;
5994 : }
5995 482 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5996 : }
5997 1872 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5998 1872 : unsigned nunits1 = nunits;
5999 1872 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6000 1872 : && reduc_inputs.length () == 1)
6001 : {
6002 41 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6003 : /* For SLP reductions we have to make sure lanes match up, but
6004 : since we're doing individual element final reduction reducing
6005 : vector width here is even more important.
6006 : ??? We can also separate lanes with permutes, for the common
6007 : case of power-of-two group-size odd/even extracts would work. */
6008 41 : if (slp_reduc && nunits != nunits1)
6009 : {
6010 41 : nunits1 = least_common_multiple (nunits1, group_size);
6011 82 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6012 : }
6013 : }
6014 1831 : else if (!slp_reduc
6015 1831 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6016 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6017 :
6018 1872 : tree vectype1 = compute_vectype;
6019 1872 : if (mode1 != mode)
6020 : {
6021 47 : vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6022 47 : stype, nunits1);
6023 : /* First reduce the vector to the desired vector size we should
6024 : do shift reduction on by combining upper and lower halves. */
6025 47 : gimple_seq stmts = NULL;
6026 47 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6027 : code, &stmts);
6028 47 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6029 47 : reduc_inputs[0] = new_temp;
6030 : }
6031 :
6032 1872 : reduce_with_shift = have_whole_vector_shift (mode1);
6033 729 : if (!VECTOR_MODE_P (mode1)
6034 2599 : || !directly_supported_p (code, vectype1))
6035 : reduce_with_shift = false;
6036 :
6037 1855 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
6038 : {
6039 1631 : int element_bitsize = vector_element_bits (vectype1);
6040 : /* Enforced by vectorizable_reduction, which disallows SLP reductions
6041 : for variable-length vectors and also requires direct target support
6042 : for loop reductions. */
6043 1631 : int nelements = TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6044 1631 : vec_perm_builder sel;
6045 1631 : vec_perm_indices indices;
6046 :
6047 1631 : int elt_offset;
6048 :
6049 1631 : tree zero_vec = build_zero_cst (vectype1);
6050 : /* Case 2: Create:
6051 : for (offset = nelements/2; offset >= 1; offset/=2)
6052 : {
6053 : Create: va' = vec_shift <va, offset>
6054 : Create: va = vop <va, va'>
6055 : } */
6056 :
6057 1631 : if (dump_enabled_p ())
6058 365 : dump_printf_loc (MSG_NOTE, vect_location,
6059 : "Reduce using vector shifts\n");
6060 :
6061 1631 : gimple_seq stmts = NULL;
6062 1631 : new_temp = gimple_convert (&stmts, vectype1, reduc_inputs[0]);
6063 1631 : for (elt_offset = nelements / 2;
6064 3572 : elt_offset >= 1;
6065 1941 : elt_offset /= 2)
6066 : {
6067 1941 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6068 1941 : indices.new_vector (sel, 2, nelements);
6069 1941 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6070 1941 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6071 : new_temp, zero_vec, mask);
6072 1941 : new_temp = gimple_build (&stmts, code,
6073 : vectype1, new_name, new_temp);
6074 : }
6075 :
6076 : /* 2.4 Extract the final scalar result. Create:
6077 : s_out3 = extract_field <v_out2, bitpos> */
6078 :
6079 1631 : if (dump_enabled_p ())
6080 365 : dump_printf_loc (MSG_NOTE, vect_location,
6081 : "extract scalar result\n");
6082 :
6083 1631 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
6084 1631 : new_temp, bitsize_int (element_bitsize),
6085 1631 : bitsize_zero_node);
6086 1631 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6087 1631 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6088 1631 : scalar_results.safe_push (new_temp);
6089 1631 : }
6090 : else
6091 : {
6092 : /* Case 3: Create:
6093 : s = extract_field <v_out2, 0>
6094 : for (offset = element_size;
6095 : offset < vector_size;
6096 : offset += element_size;)
6097 : {
6098 : Create: s' = extract_field <v_out2, offset>
6099 : Create: s = op <s, s'> // For non SLP cases
6100 : } */
6101 :
6102 241 : if (dump_enabled_p ())
6103 150 : dump_printf_loc (MSG_NOTE, vect_location,
6104 : "Reduce using scalar code.\n");
6105 :
6106 241 : tree compute_type = TREE_TYPE (vectype1);
6107 241 : unsigned element_bitsize = vector_element_bits (vectype1);
6108 241 : unsigned vec_size_in_bits = element_bitsize
6109 241 : * TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6110 241 : tree bitsize = bitsize_int (element_bitsize);
6111 241 : gimple_seq stmts = NULL;
6112 647 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6113 : {
6114 406 : unsigned bit_offset;
6115 812 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6116 406 : vec_temp, bitsize, bitsize_zero_node);
6117 :
6118 : /* In SLP we don't need to apply reduction operation, so we just
6119 : collect s' values in SCALAR_RESULTS. */
6120 406 : if (slp_reduc)
6121 396 : scalar_results.safe_push (new_temp);
6122 :
6123 1000 : for (bit_offset = element_bitsize;
6124 1406 : bit_offset < vec_size_in_bits;
6125 1000 : bit_offset += element_bitsize)
6126 : {
6127 1000 : tree bitpos = bitsize_int (bit_offset);
6128 1000 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6129 : compute_type, vec_temp,
6130 : bitsize, bitpos);
6131 1000 : if (slp_reduc)
6132 : {
6133 : /* In SLP we don't need to apply reduction operation, so
6134 : we just collect s' values in SCALAR_RESULTS. */
6135 990 : new_temp = new_name;
6136 990 : scalar_results.safe_push (new_name);
6137 : }
6138 : else
6139 10 : new_temp = gimple_build (&stmts, code, compute_type,
6140 : new_name, new_temp);
6141 : }
6142 : }
6143 :
6144 : /* The only case where we need to reduce scalar results in a SLP
6145 : reduction, is unrolling. If the size of SCALAR_RESULTS is
6146 : greater than GROUP_SIZE, we reduce them combining elements modulo
6147 : GROUP_SIZE. */
6148 241 : if (slp_reduc)
6149 : {
6150 231 : tree res, first_res, new_res;
6151 :
6152 : /* Reduce multiple scalar results in case of SLP unrolling. */
6153 925 : for (j = group_size; scalar_results.iterate (j, &res);
6154 : j++)
6155 : {
6156 694 : first_res = scalar_results[j % group_size];
6157 694 : new_res = gimple_build (&stmts, code, compute_type,
6158 : first_res, res);
6159 694 : scalar_results[j % group_size] = new_res;
6160 : }
6161 231 : scalar_results.truncate (group_size);
6162 1154 : for (k = 0; k < group_size; k++)
6163 1384 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6164 692 : scalar_results[k]);
6165 : }
6166 : else
6167 : {
6168 : /* Reduction chain - we have one scalar to keep in
6169 : SCALAR_RESULTS. */
6170 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6171 10 : scalar_results.safe_push (new_temp);
6172 : }
6173 :
6174 241 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6175 : }
6176 :
6177 1872 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6178 0 : && induc_val)
6179 : {
 6180                 :	  /* Earlier we set the initial value to be a vector of induc_val
6181 : values. Check the result and if it is induc_val then replace
6182 : with the original initial value, unless induc_val is
6183 : the same as initial_def already. */
6184 0 : tree zcompare = make_ssa_name (boolean_type_node);
6185 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6186 0 : scalar_results[0], induc_val);
6187 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6188 0 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
6189 0 : tree tmp = make_ssa_name (new_scalar_dest);
6190 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6191 0 : initial_def, scalar_results[0]);
6192 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6193 0 : scalar_results[0] = tmp;
6194 : }
6195 : }
6196 :
6197 : /* 2.5 Adjust the final result by the initial value of the reduction
6198 : variable. (When such adjustment is not needed, then
6199 : 'adjustment_def' is zero). For example, if code is PLUS we create:
6200 : new_temp = loop_exit_def + adjustment_def */
6201 :
6202 22202 : if (adjustment_def)
6203 : {
6204 15910 : gcc_assert (!slp_reduc || group_size == 1);
6205 15910 : gimple_seq stmts = NULL;
6206 15910 : if (double_reduc)
6207 : {
6208 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6209 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6210 0 : new_temp = gimple_build (&stmts, code, vectype,
6211 0 : reduc_inputs[0], adjustment_def);
6212 : }
6213 : else
6214 : {
6215 15910 : new_temp = scalar_results[0];
6216 15910 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6217 15910 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6218 : adjustment_def);
6219 15910 : new_temp = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6220 : new_temp);
6221 15910 : new_temp = gimple_build (&stmts, code, TREE_TYPE (compute_vectype),
6222 : new_temp, adjustment_def);
6223 15910 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6224 : }
6225 :
6226 15910 : epilog_stmt = gimple_seq_last_stmt (stmts);
6227 15910 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6228 15910 : scalar_results[0] = new_temp;
6229 : }
6230 :
6231 : /* Record this operation if it could be reused by the epilogue loop. */
6232 22202 : if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
6233 22202 : && reduc_inputs.length () == 1)
6234 22016 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6235 : { orig_reduc_input, reduc_info });
6236 :
6237 : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6238 : phis with new adjusted scalar results, i.e., replace use <s_out0>
6239 : with use <s_out4>.
6240 :
6241 : Transform:
6242 : loop_exit:
6243 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6244 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6245 : v_out2 = reduce <v_out1>
6246 : s_out3 = extract_field <v_out2, 0>
6247 : s_out4 = adjust_result <s_out3>
6248 : use <s_out0>
6249 : use <s_out0>
6250 :
6251 : into:
6252 :
6253 : loop_exit:
6254 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6255 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6256 : v_out2 = reduce <v_out1>
6257 : s_out3 = extract_field <v_out2, 0>
6258 : s_out4 = adjust_result <s_out3>
6259 : use <s_out4>
6260 : use <s_out4> */
6261 :
6262 44404 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6263 22202 : auto_vec<gimple *> phis;
6264 44865 : for (k = 0; k < live_out_stmts.size (); k++)
6265 : {
6266 22663 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6267 22663 : tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6268 :
6269 : /* Find the loop-closed-use at the loop exit of the original scalar
6270 : result. (The reduction result is expected to have two immediate uses,
6271 : one at the latch block, and one at the loop exit). Note with
6272 : early break we can have two exit blocks, so pick the correct PHI. */
6273 115113 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6274 69787 : if (!is_gimple_debug (USE_STMT (use_p))
6275 69787 : && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6276 : {
6277 22658 : gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
6278 22658 : if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6279 22650 : phis.safe_push (USE_STMT (use_p));
6280 22663 : }
6281 :
6282 45313 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6283 : {
6284 : /* Replace the uses: */
6285 22650 : orig_name = PHI_RESULT (exit_phi);
6286 :
6287 : /* Look for a single use at the target of the skip edge. */
6288 22650 : if (unify_with_main_loop_p)
6289 : {
6290 38 : use_operand_p use_p;
6291 38 : gimple *user;
6292 38 : if (!single_imm_use (orig_name, &use_p, &user))
6293 0 : gcc_unreachable ();
6294 38 : orig_name = gimple_get_lhs (user);
6295 : }
6296 :
6297 22650 : scalar_result = scalar_results[k];
6298 84028 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6299 : {
6300 38728 : gphi *use_phi = dyn_cast <gphi *> (use_stmt);
6301 116228 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6302 : {
6303 38750 : if (use_phi
6304 38750 : && (phi_arg_edge_from_use (use_p)->flags & EDGE_ABNORMAL))
6305 : {
6306 0 : gcc_assert (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (orig_name));
6307 0 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (scalar_result) = 1;
6308 : }
6309 38750 : SET_USE (use_p, scalar_result);
6310 : }
6311 38728 : update_stmt (use_stmt);
6312 22650 : }
6313 : }
6314 :
6315 22663 : phis.truncate (0);
6316 : }
6317 22202 : }
6318 :
6319 : /* Return a vector of type VECTYPE that is equal to the vector select
6320 : operation "MASK ? VEC : IDENTITY". Insert the select statements
6321 : before GSI. */
6322 :
6323 : static tree
6324 9 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6325 : tree vec, tree identity)
6326 : {
6327 9 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6328 9 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6329 : mask, vec, identity);
6330 9 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6331 9 : return cond;
6332 : }
6333 :
/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
   order, starting with LHS.  Insert the extraction statements before GSI and
   associate the new scalar SSA names with variable SCALAR_DEST.
   If MASK is nonzero mask the input and then operate on it unconditionally.
   Return the SSA name for the result.  */

static tree
vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
		       tree_code code, tree lhs, tree vector_rhs,
		       tree mask)
{
  tree vectype = TREE_TYPE (vector_rhs);
  tree scalar_type = TREE_TYPE (vectype);
  tree bitsize = TYPE_SIZE (scalar_type);
  unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
  unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);

  /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
     to perform an unconditional element-wise reduction of it.  */
  if (mask)
    {
      tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
						   "masked_vector_rhs");
      /* Inactive lanes are replaced by the operation's neutral value so
	 that they do not change the folded result.  */
      tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
						  false);
      tree vector_identity = build_vector_from_val (vectype, neutral_op);
      gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
					     mask, vector_rhs, vector_identity);
      gsi_insert_before (gsi, select, GSI_SAME_STMT);
      vector_rhs = masked_vector_rhs;
    }

  /* Extract each scalar element with a BIT_FIELD_REF and fold it into
     the running result LHS, preserving strict left-to-right order.  */
  for (unsigned HOST_WIDE_INT bit_offset = 0;
       bit_offset < vec_size_in_bits;
       bit_offset += element_bitsize)
    {
      tree bitpos = bitsize_int (bit_offset);
      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
			 bitsize, bitpos);

      gassign *stmt = gimple_build_assign (scalar_dest, rhs);
      rhs = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, rhs);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
      /* Fold the vector extract, combining it with a previous reversal
	 like seen in PR90579.  */
      auto gsi2 = gsi_for_stmt (stmt);
      if (fold_stmt (&gsi2, follow_all_ssa_edges))
	update_stmt (gsi_stmt (gsi2));

      /* Accumulate: new_name = lhs CODE element.  */
      stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
      tree new_name = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, new_name);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
      lhs = new_name;
    }
  return lhs;
}
6392 :
6393 : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6394 : type of the vector input. */
6395 :
6396 : static internal_fn
6397 2538 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6398 : {
6399 2538 : internal_fn mask_reduc_fn;
6400 2538 : internal_fn mask_len_reduc_fn;
6401 :
6402 2538 : switch (reduc_fn)
6403 : {
6404 0 : case IFN_FOLD_LEFT_PLUS:
6405 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6406 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6407 0 : break;
6408 :
6409 : default:
6410 : return IFN_LAST;
6411 : }
6412 :
6413 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6414 : OPTIMIZE_FOR_SPEED))
6415 : return mask_reduc_fn;
6416 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6417 : OPTIMIZE_FOR_SPEED))
6418 : return mask_len_reduc_fn;
6419 : return IFN_LAST;
6420 : }
6421 :
/* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
   statement that sets the live-out value.  REDUC_DEF_STMT is the phi
   statement.  CODE is the operation performed by STMT_INFO and OPS are
   its scalar operands.  REDUC_INDEX is the index of the operand in
   OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
   implements in-order reduction, or IFN_LAST if we should open-code it.
   VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
   that should be used to control the operation in a fully-masked loop.  */

static bool
vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
			       stmt_vec_info stmt_info,
			       gimple_stmt_iterator *gsi,
			       slp_tree slp_node,
			       code_helper code, internal_fn reduc_fn,
			       int num_ops, tree vectype_in,
			       int reduc_index, vec_loop_masks *masks,
			       vec_loop_lens *lens)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype_out = SLP_TREE_VECTYPE (slp_node);
  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);

  gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));

  /* For an IFN_COND_* operation switch CODE to the equivalent tree code;
     the mask operand is fetched separately into vec_opmask below.  */
  bool is_cond_op = false;
  if (!code.is_tree_code ())
    {
      code = conditional_internal_fn_code (internal_fn (code));
      gcc_assert (code != ERROR_MARK);
      is_cond_op = true;
    }

  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);

  gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
			TYPE_VECTOR_SUBPARTS (vectype_in)));

  /* ??? We should, when transforming the cycle PHI, record the existing
     scalar def as vector def so looking up the vector def works.  This
     would also allow generalizing this for reduction paths of length > 1
     and/or SLP reductions.  */
  slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
  stmt_vec_info reduc_var_def = SLP_TREE_SCALAR_STMTS (reduc_node)[0];
  tree reduc_var = gimple_get_lhs (STMT_VINFO_STMT (reduc_var_def));

  /* The operands either come from a binary operation or an IFN_COND operation.
     The former is a gimple assign with binary rhs and the latter is a
     gimple call with four arguments.  */
  gcc_assert (num_ops == 2 || num_ops == 4);

  auto_vec<tree> vec_oprnds0, vec_opmask;
  /* Fetch the vector defs of the non-reduction operand; for a masked
     (IFN_COND) op the value operands start at child index 2.  */
  vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
						  + (1 - reduc_index)],
		     &vec_oprnds0);
  /* For an IFN_COND_OP we also need the vector mask operand.  */
  if (is_cond_op)
    vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);

  /* The transform below relies on preserving the original scalar PHI
     and its latch def which we replace.  So work backwards from there.  */
  tree scalar_dest
    = gimple_phi_arg_def_from_edge (as_a <gphi *> (STMT_VINFO_STMT
						   (reduc_var_def)),
				   loop_latch_edge (loop));
  stmt_vec_info scalar_dest_def_info
    = vect_stmt_to_vectorize (loop_vinfo->lookup_def (scalar_dest));
  tree scalar_type = TREE_TYPE (scalar_dest);

  int vec_num = vec_oprnds0.length ();
  tree vec_elem_type = TREE_TYPE (vectype_out);
  gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));

  /* In a fully-masked loop inactive lanes are replaced by an identity
     value: zero, or negative zero when signed zeros must be honored
     (in which case sign-dependent rounding is asserted absent).  */
  tree vector_identity = NULL_TREE;
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      vector_identity = build_zero_cst (vectype_out);
      if (!HONOR_SIGNED_ZEROS (vectype_out))
	;
      else
	{
	  gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
	  vector_identity = const_unop (NEGATE_EXPR, vectype_out,
					vector_identity);
	}
    }

  tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
  int i;
  tree def0;
  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    {
      gimple *new_stmt;
      tree mask = NULL_TREE;
      tree len = NULL_TREE;
      tree bias = NULL_TREE;
      /* Combine the loop mask with the IFN_COND mask where needed.  */
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
	{
	  tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					       vec_num, vectype_in, i);
	  if (is_cond_op)
	    mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
				     loop_mask, vec_opmask[i], gsi);
	  else
	    mask = loop_mask;
	}
      else if (is_cond_op)
	mask = vec_opmask[i];
      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
	{
	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
				   i, 1, false);
	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
	  bias = build_int_cst (intQI_type_node, biasval);
	  /* The mask+len ifn requires a mask operand; use an all-ones
	     mask when the operation itself is unconditional.  */
	  if (!is_cond_op)
	    mask = build_minus_one_cst (truth_type_for (vectype_in));
	}

      /* Handle MINUS by adding the negative.  */
      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
	{
	  tree negated = make_ssa_name (vectype_out);
	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
	  def0 = negated;
	}

      /* With no masked ifn available, apply the mask to the input by
	 selecting the identity value for inactive lanes.  */
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
	  && mask && mask_reduc_fn == IFN_LAST)
	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
				    vector_identity);

      /* On the first iteration the input is simply the scalar phi
	 result, and for subsequent iterations it is the output of
	 the preceding operation.  */
      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
	{
	  if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
						   def0, mask, len, bias);
	  else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
						   def0, mask);
	  else
	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
						   def0);
	  /* For chained SLP reductions the output of the previous reduction
	     operation serves as the input of the next.  For the final statement
	     the output cannot be a temporary - we reuse the original
	     scalar destination of the last statement.  */
	  if (i != vec_num - 1)
	    {
	      gimple_set_lhs (new_stmt, scalar_dest_var);
	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
	      gimple_set_lhs (new_stmt, reduc_var);
	    }
	}
      else
	{
	  /* No in-order reduction ifn: open-code the fold as a chain of
	     scalar extract-and-accumulate statements.  */
	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
					     tree_code (code), reduc_var, def0,
					     mask);
	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
	  /* Remove the statement, so that we can use the same code paths
	     as for statements that we've just created.  */
	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
	  gsi_remove (&tmp_gsi, true);
	}

      if (i == vec_num - 1)
	{
	  gimple_set_lhs (new_stmt, scalar_dest);
	  vect_finish_replace_stmt (loop_vinfo,
				    scalar_dest_def_info,
				    new_stmt);
	}
      else
	vect_finish_stmt_generation (loop_vinfo,
				     scalar_dest_def_info,
				     new_stmt, gsi);

      slp_node->push_vec_def (new_stmt);
    }

  return true;
}
6608 :
6609 : /* Function is_nonwrapping_integer_induction.
6610 :
6611 : Check if STMT_VINO (which is part of loop LOOP) both increments and
6612 : does not cause overflow. */
6613 :
6614 : static bool
6615 408 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6616 : {
6617 408 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6618 408 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6619 408 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6620 408 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6621 408 : widest_int ni, max_loop_value, lhs_max;
6622 408 : wi::overflow_type overflow = wi::OVF_NONE;
6623 :
6624 : /* Make sure the loop is integer based. */
6625 408 : if (TREE_CODE (base) != INTEGER_CST
6626 109 : || TREE_CODE (step) != INTEGER_CST)
6627 : return false;
6628 :
6629 : /* Check that the max size of the loop will not wrap. */
6630 :
6631 109 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6632 : return true;
6633 :
6634 8 : if (! max_stmt_executions (loop, &ni))
6635 : return false;
6636 :
6637 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6638 8 : &overflow);
6639 8 : if (overflow)
6640 : return false;
6641 :
6642 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6643 16 : TYPE_SIGN (lhs_type), &overflow);
6644 8 : if (overflow)
6645 : return false;
6646 :
6647 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6648 8 : <= TYPE_PRECISION (lhs_type));
6649 408 : }
6650 :
6651 : /* Check if masking can be supported by inserting a conditional expression.
6652 : CODE is the code for the operation. COND_FN is the conditional internal
6653 : function, if it exists. VECTYPE_IN is the type of the vector input. */
6654 : static bool
6655 5104 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6656 : tree vectype_in)
6657 : {
6658 5104 : if (cond_fn != IFN_LAST
6659 5104 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
6660 : OPTIMIZE_FOR_SPEED))
6661 : return false;
6662 :
6663 3598 : if (code.is_tree_code ())
6664 3592 : switch (tree_code (code))
6665 : {
6666 : case DOT_PROD_EXPR:
6667 : case SAD_EXPR:
6668 : return true;
6669 :
6670 : default:
6671 : break;
6672 : }
6673 : return false;
6674 : }
6675 :
6676 : /* Insert a conditional expression to enable masked vectorization. CODE is the
6677 : code for the operation. VOP is the array of operands. MASK is the loop
6678 : mask. GSI is a statement iterator used to place the new conditional
6679 : expression. */
6680 : static void
6681 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6682 : gimple_stmt_iterator *gsi)
6683 : {
6684 4 : switch (tree_code (code))
6685 : {
6686 4 : case DOT_PROD_EXPR:
6687 4 : {
6688 4 : tree vectype = TREE_TYPE (vop[1]);
6689 4 : tree zero = build_zero_cst (vectype);
6690 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6691 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6692 : mask, vop[1], zero);
6693 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6694 4 : vop[1] = masked_op1;
6695 4 : break;
6696 : }
6697 :
6698 0 : case SAD_EXPR:
6699 0 : {
6700 0 : tree vectype = TREE_TYPE (vop[1]);
6701 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6702 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6703 : mask, vop[1], vop[0]);
6704 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6705 0 : vop[1] = masked_op1;
6706 0 : break;
6707 : }
6708 :
6709 0 : default:
6710 0 : gcc_unreachable ();
6711 : }
6712 4 : }
6713 :
/* Given an operation with CODE in loop reduction path whose reduction PHI is
   specified by REDUC_INFO, the operation has TYPE of scalar result, and its
   input vectype is represented by VECTYPE_IN.  The vectype of vectorized result
   may be different from VECTYPE_IN, either in base type or vectype lanes,
   lane-reducing operation is the case.  This function check if it is possible,
   and how to perform partial vectorization on the operation in the context
   of LOOP_VINFO.  */

static void
vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
					    vect_reduc_info reduc_info,
					    slp_tree slp_node,
					    code_helper code, tree type,
					    tree vectype_in)
{
  enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
  internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
  /* If CODE is already a masked internal function reuse it directly,
     otherwise look up the conditional variant of the operation.  */
  internal_fn cond_fn
    = ((code.is_internal_fn ()
	&& internal_fn_mask_index ((internal_fn)code) != -1)
       ? (internal_fn)code : get_conditional_internal_fn (code, type));

  /* An ordinary reduction needs either a supported conditional ifn or a
     VEC_COND_EXPR-based fallback to honor the loop mask.  */
  if (reduc_type != FOLD_LEFT_REDUCTION
      && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
      && (cond_fn == IFN_LAST
	  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
					      OPTIMIZE_FOR_SPEED)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors because"
			 " no conditional operation is available.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }
  /* An open-coded fold-left reduction masks its input with a
     VEC_COND_EXPR, so that must be expandable on VECTYPE_IN.  */
  else if (reduc_type == FOLD_LEFT_REDUCTION
	   && reduc_fn == IFN_LAST
	   && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors because"
			 " no conditional operation is available.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }
  /* An unmasked FP fold-left reduction with sign-dependent rounding
     cannot simply substitute zeros for the inactive lanes.  */
  else if (reduc_type == FOLD_LEFT_REDUCTION
	   && internal_fn_mask_index (reduc_fn) == -1
	   && FLOAT_TYPE_P (vectype_in)
	   && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors because"
			 " signed zeros cannot be preserved.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }
  else
    {
      /* Partial vectorization is possible; record the loop mask or
	 loop length this operation will need.  */
      internal_fn mask_reduc_fn
	= get_masked_reduction_fn (reduc_fn, vectype_in);
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
      unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);

      if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
      else
	vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
    }
}
6783 :
/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
   the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
   and the analysis is for slp if SLP_NODE is not NULL.

   For a lane-reducing operation, the loop reduction path that it lies in,
   may contain normal operation, or other lane-reducing operation of different
   input type size, an example as:

     int sum = 0;
     for (i)
       {
	 ...
	 sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
	 sum += w[i];               // widen-sum <vector(16) char>
	 sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
	 sum += n[i];               // normal <vector(4) int>
	 ...
       }

   Vectorization factor is essentially determined by operation whose input
   vectype has the most lanes ("vector(16) char" in the example), while we
   need to choose input vectype with the least lanes ("vector(4) int" in the
   example) to determine effective number of vector reduction PHIs.  */

bool
vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
			    slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  gimple *stmt = stmt_info->stmt;

  if (!lane_reducing_stmt_p (stmt))
    return false;

  tree type = TREE_TYPE (gimple_assign_lhs (stmt));

  if (!INTEGRAL_TYPE_P (type))
    return false;

  /* Do not try to vectorize bit-precision reductions.  */
  if (!type_has_mode_precision_p (type))
    return false;

  vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);

  /* TODO: Support lane-reducing operation that does not directly participate
     in loop reduction.  */
  if (!reduc_info)
    return false;

  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
     recognized.  */
  gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
  gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);

  /* Validate every operand: it must have a simple def and a (possibly
     newly computed) vectype compatible with its SLP operand node.  */
  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
    {
      slp_tree slp_op;
      tree op;
      tree vectype;
      enum vect_def_type dt;

      if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
			       &slp_op, &dt, &vectype))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "use not simple.\n");
	  return false;
	}

      if (!vectype)
	{
	  vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
						 slp_op);
	  if (!vectype)
	    return false;
	}

      if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "incompatible vector types for invariants\n");
	  return false;
	}

      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
	continue;

      /* There should be at most one cycle def in the stmt.  */
      if (VECTORIZABLE_CYCLE_DEF (dt))
	return false;
    }

  slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
  tree vectype_in = SLP_TREE_VECTYPE (node_in);
  gcc_assert (vectype_in);

  /* Compute number of effective vector statements for costing.  */
  unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
  gcc_assert (ncopies_for_cost >= 1);

  if (vect_is_emulated_mixed_dot_prod (slp_node))
    {
      /* We need extra two invariants: one that contains the minimum signed
	 value and one that contains half of its negative.  */
      int prologue_stmts = 2;
      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
					scalar_to_vec, slp_node, 0,
					vect_prologue);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
		     "extra prologue_cost = %d .\n", cost);

      /* Three dot-products and a subtraction.  */
      ncopies_for_cost *= 4;
    }

  record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
		    0, vect_body);

  /* If partial vectors are still an option, check whether this
     operation can be masked, and record the masks/lens it needs.  */
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
						  node_in, code, type,
						  vectype_in);
    }

  /* Transform via vect_transform_reduction.  */
  SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
  return true;
}
6917 :
6918 : /* Function vectorizable_reduction.
6919 :
6920 : Check if STMT_INFO performs a reduction operation that can be vectorized.
6921 : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6922 : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6923 : Return true if STMT_INFO is vectorizable in this way.
6924 :
6925 : This function also handles reduction idioms (patterns) that have been
6926 : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6927 : may be of this form:
6928 : X = pattern_expr (arg0, arg1, ..., X)
6929 : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6930 : sequence that had been detected and replaced by the pattern-stmt
6931 : (STMT_INFO).
6932 :
6933 : This function also handles reduction of condition expressions, for example:
6934 : for (int i = 0; i < N; i++)
6935 : if (a[i] < value)
6936 : last = a[i];
6937 : This is handled by vectorising the loop and creating an additional vector
6938 : containing the loop indexes for which "a[i] < value" was true. In the
6939 : function epilogue this is reduced to a single max value and then used to
6940 : index into the vector of results.
6941 :
6942 : In some cases of reduction patterns, the type of the reduction variable X is
6943 : different than the type of the other arguments of STMT_INFO.
6944 : In such cases, the vectype that is used when transforming STMT_INFO into
6945 : a vector stmt is different than the vectype that is used to determine the
6946 : vectorization factor, because it consists of a different number of elements
6947 : than the actual number of elements that are being operated upon in parallel.
6948 :
6949 : For example, consider an accumulation of shorts into an int accumulator.
6950 : On some targets it's possible to vectorize this pattern operating on 8
6951 : shorts at a time (hence, the vectype for purposes of determining the
6952 : vectorization factor should be V8HI); on the other hand, the vectype that
6953 : is used to create the vector form is actually V4SI (the type of the result).
6954 :
6955 : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6956 : indicates what is the actual level of parallelism (V8HI in the example), so
6957 : that the right vectorization factor would be derived. This vectype
6958 : corresponds to the type of arguments to the reduction stmt, and should *NOT*
6959 : be used to create the vectorized stmt. The right vectype for the vectorized
6960 : stmt is obtained from the type of the result X:
6961 : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6962 :
6963 : This means that, contrary to "regular" reductions (or "regular" stmts in
6964 : general), the following equation:
6965 : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6966 : does *NOT* necessarily hold for reduction patterns. */
6967 :
6968 : bool
6969 334430 : vectorizable_reduction (loop_vec_info loop_vinfo,
6970 : stmt_vec_info stmt_info, slp_tree slp_node,
6971 : slp_instance slp_node_instance,
6972 : stmt_vector_for_cost *cost_vec)
6973 : {
6974 334430 : tree vectype_in = NULL_TREE;
6975 334430 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6976 334430 : stmt_vec_info cond_stmt_vinfo = NULL;
6977 334430 : int i;
6978 334430 : int ncopies;
6979 334430 : bool single_defuse_cycle = false;
6980 334430 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6981 334430 : tree cond_reduc_val = NULL_TREE;
6982 :
6983 : /* Make sure it was already recognized as a reduction computation. */
6984 334430 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6985 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6986 334430 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6987 : return false;
6988 :
6989 : /* The reduction meta. */
6990 57732 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6991 :
6992 57732 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6993 : {
6994 1427 : gcc_assert (is_a <gphi *> (stmt_info->stmt));
6995 : /* We eventually need to set a vector type on invariant arguments. */
6996 : unsigned j;
6997 : slp_tree child;
6998 4273 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6999 2854 : if (!vect_maybe_update_slp_op_vectype (child,
7000 : SLP_TREE_VECTYPE (slp_node)))
7001 : {
7002 0 : if (dump_enabled_p ())
7003 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7004 : "incompatible vector types for "
7005 : "invariants\n");
7006 0 : return false;
7007 : }
7008 2854 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7009 2854 : && !useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
7010 : SLP_TREE_VECTYPE (child)))
7011 : {
7012 : /* With bools we can have mask and non-mask precision vectors
7013 : or different non-mask precisions. while pattern recog is
7014 : supposed to guarantee consistency here, we do not have
7015 : pattern stmts for PHIs (PR123316).
7016 : Deal with that here instead of ICEing later. */
7017 8 : if (dump_enabled_p ())
7018 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7019 : "incompatible vector type setup from "
7020 : "bool pattern detection\n");
7021 8 : return false;
7022 : }
7023 : /* Analysis for double-reduction is done on the outer
7024 : loop PHI, nested cycles have no further restrictions. */
7025 1419 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7026 1419 : return true;
7027 : }
7028 :
7029 56305 : if (!is_a <gphi *> (stmt_info->stmt))
7030 : {
7031 6880 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
7032 6880 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
7033 6880 : return true;
7034 : }
7035 :
7036 49425 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7037 49425 : stmt_vec_info phi_info = stmt_info;
7038 49425 : bool double_reduc = false;
7039 49425 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7040 : {
7041 : /* We arrive here for both the inner loop LC PHI and the
7042 : outer loop PHI. The latter is what we want to analyze the
7043 : reduction with. The LC PHI is handled by vectorizable_lc_phi. */
7044 287 : if (gimple_bb (stmt_info->stmt) != loop->header)
7045 0 : return false;
7046 :
7047 : /* Set loop and phi_info to the inner loop. */
7048 287 : use_operand_p use_p;
7049 287 : gimple *use_stmt;
7050 287 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7051 : &use_p, &use_stmt);
7052 287 : gcc_assert (res);
7053 287 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
7054 287 : loop = loop->inner;
7055 287 : double_reduc = true;
7056 : }
7057 :
7058 49425 : const bool reduc_chain = reduc_info->is_reduc_chain;
7059 49425 : slp_node_instance->reduc_phis = slp_node;
7060 : /* ??? We're leaving slp_node to point to the PHIs, we only
7061 : need it to get at the number of vector stmts which wasn't
7062 : yet initialized for the instance root. */
7063 :
7064 : /* PHIs should not participate in patterns. */
7065 49425 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7066 49425 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7067 :
7068 : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7069 : and compute the reduction chain length. Discover the real
7070 : reduction operation stmt on the way (slp_for_stmt_info). */
7071 49425 : unsigned reduc_chain_length = 0;
7072 49425 : stmt_info = NULL;
7073 49425 : slp_tree slp_for_stmt_info = NULL;
7074 49425 : slp_tree vdef_slp = slp_node_instance->root;
7075 108694 : while (vdef_slp != slp_node)
7076 : {
7077 60021 : int reduc_idx = SLP_TREE_REDUC_IDX (vdef_slp);
7078 60021 : if (reduc_idx == -1)
7079 : {
7080 744 : if (dump_enabled_p ())
7081 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7082 : "reduction chain broken by patterns.\n");
7083 752 : return false;
7084 : }
7085 59277 : stmt_vec_info vdef = SLP_TREE_REPRESENTATIVE (vdef_slp);
7086 59277 : if (is_a <gphi *> (vdef->stmt))
7087 : {
7088 574 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7089 : /* Do not count PHIs towards the chain length. */
7090 574 : continue;
7091 : }
7092 58703 : gimple_match_op op;
7093 58703 : if (!gimple_extract_op (vdef->stmt, &op))
7094 : {
7095 0 : if (dump_enabled_p ())
7096 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7097 : "reduction chain includes unsupported"
7098 : " statement type.\n");
7099 0 : return false;
7100 : }
7101 58703 : if (CONVERT_EXPR_CODE_P (op.code))
7102 : {
7103 3312 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7104 : {
7105 8 : if (dump_enabled_p ())
7106 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7107 : "conversion in the reduction chain.\n");
7108 8 : return false;
7109 : }
7110 3304 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
7111 : }
7112 : else
7113 : {
7114 : /* First non-conversion stmt. */
7115 55391 : if (!slp_for_stmt_info)
7116 48673 : slp_for_stmt_info = vdef_slp;
7117 :
7118 55391 : if (lane_reducing_op_p (op.code))
7119 : {
7120 : /* The last operand of lane-reducing operation is for
7121 : reduction. */
7122 454 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7123 :
7124 454 : slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
7125 454 : tree vectype_op = SLP_TREE_VECTYPE (op_node);
7126 454 : tree type_op = TREE_TYPE (op.ops[0]);
7127 454 : if (!vectype_op)
7128 : {
7129 9 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7130 : type_op);
7131 9 : if (!vectype_op
7132 9 : || !vect_maybe_update_slp_op_vectype (op_node,
7133 : vectype_op))
7134 0 : return false;
7135 : }
7136 :
7137 : /* To accommodate lane-reducing operations of mixed input
7138 : vectypes, choose input vectype with the least lanes for the
7139 : reduction PHI statement, which would result in the most
7140 : ncopies for vectorized reduction results. */
7141 454 : if (!vectype_in
7142 454 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7143 46 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7144 431 : vectype_in = vectype_op;
7145 : }
7146 54937 : else if (!vectype_in)
7147 48242 : vectype_in = SLP_TREE_VECTYPE (slp_node);
7148 55391 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7149 : }
7150 58695 : reduc_chain_length++;
7151 : }
7152 48673 : if (!slp_for_stmt_info)
7153 : {
7154 0 : if (dump_enabled_p ())
7155 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7156 : "only noop-conversions in the reduction chain.\n");
7157 0 : return false;
7158 : }
7159 48673 : stmt_info = SLP_TREE_REPRESENTATIVE (slp_for_stmt_info);
7160 :
7161 : /* PHIs should not participate in patterns. */
7162 48673 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7163 :
7164 : /* 1. Is vectorizable reduction? */
7165 : /* Not supportable if the reduction variable is used in the loop, unless
7166 : it's a reduction chain. */
7167 48673 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7168 0 : && !reduc_chain)
7169 : return false;
7170 :
7171 : /* Reductions that are not used even in an enclosing outer-loop,
7172 : are expected to be "live" (used out of the loop). */
7173 48673 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7174 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7175 : return false;
7176 :
7177 : /* 2. Has this been recognized as a reduction pattern?
7178 :
7179 : Check if STMT represents a pattern that has been recognized
7180 : in earlier analysis stages. For stmts that represent a pattern,
7181 : the STMT_VINFO_RELATED_STMT field records the last stmt in
7182 : the original sequence that constitutes the pattern. */
7183 :
7184 48673 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7185 48673 : if (orig_stmt_info)
7186 : {
7187 3271 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7188 3271 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7189 : }
7190 :
7191 : /* 3. Check the operands of the operation. The first operands are defined
7192 : inside the loop body. The last operand is the reduction variable,
7193 : which is defined by the loop-header-phi. */
7194 :
7195 48673 : tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
7196 48673 : VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
7197 :
7198 48673 : gimple_match_op op;
7199 48673 : if (!gimple_extract_op (stmt_info->stmt, &op))
7200 0 : gcc_unreachable ();
7201 48673 : bool lane_reducing = lane_reducing_op_p (op.code);
7202 :
7203 48673 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7204 15140 : && !SCALAR_FLOAT_TYPE_P (op.type))
7205 : return false;
7206 :
7207 : /* Do not try to vectorize bit-precision reductions. */
7208 48673 : if (!type_has_mode_precision_p (op.type)
7209 1552 : && op.code != BIT_AND_EXPR
7210 1472 : && op.code != BIT_IOR_EXPR
7211 49108 : && op.code != BIT_XOR_EXPR)
7212 : return false;
7213 :
7214 : /* Lane-reducing ops also never can be used in a SLP reduction group
7215 : since we'll mix lanes belonging to different reductions. But it's
7216 : OK to use them in a reduction chain or when the reduction group
7217 : has just one element. */
7218 48363 : if (lane_reducing
7219 48363 : && !reduc_chain
7220 404 : && SLP_TREE_LANES (slp_node) > 1)
7221 : {
7222 0 : if (dump_enabled_p ())
7223 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7224 : "lane-reducing reduction in reduction group.\n");
7225 0 : return false;
7226 : }
7227 :
7228 : /* All uses but the last are expected to be defined in the loop.
7229 : The last use is the reduction variable. In case of nested cycle this
7230 : assumption is not true: we use reduc_index to record the index of the
7231 : reduction variable. */
7232 48363 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7233 48363 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7234 48363 : gcc_assert (op.code != COND_EXPR || !COMPARISON_CLASS_P (op.ops[0]));
7235 153555 : for (i = 0; i < (int) op.num_ops; i++)
7236 : {
7237 : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7238 105192 : if (i == 0 && op.code == COND_EXPR)
7239 52784 : continue;
7240 :
7241 104385 : stmt_vec_info def_stmt_info;
7242 104385 : enum vect_def_type dt;
7243 104385 : if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
7244 : i, &op.ops[i], &slp_op[i], &dt,
7245 104385 : &vectype_op[i], &def_stmt_info))
7246 : {
7247 0 : if (dump_enabled_p ())
7248 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7249 : "use not simple.\n");
7250 0 : return false;
7251 : }
7252 :
7253 : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7254 : reduction operand twice (once as definition, once as else). */
7255 104385 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]
7256 208770 : == SLP_TREE_CHILDREN
7257 104385 : (slp_for_stmt_info)[SLP_TREE_REDUC_IDX (slp_for_stmt_info)])
7258 51977 : continue;
7259 :
7260 : /* There should be only one cycle def in the stmt, the one
7261 : leading to reduc_def. */
7262 52408 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]->cycle_info.id != -1)
7263 : return false;
7264 :
7265 52408 : if (!vectype_op[i])
7266 4506 : vectype_op[i]
7267 4506 : = get_vectype_for_scalar_type (loop_vinfo,
7268 4506 : TREE_TYPE (op.ops[i]), slp_op[i]);
7269 :
7270 : /* Record how the non-reduction-def value of COND_EXPR is defined.
7271 : ??? For a chain of multiple CONDs we'd have to match them up all. */
7272 52408 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7273 : {
7274 784 : if (dt == vect_constant_def)
7275 : {
7276 95 : cond_reduc_dt = dt;
7277 95 : cond_reduc_val = op.ops[i];
7278 : }
7279 689 : else if (dt == vect_induction_def
7280 408 : && def_stmt_info
7281 1097 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7282 : {
7283 109 : cond_reduc_dt = dt;
7284 109 : cond_stmt_vinfo = def_stmt_info;
7285 : }
7286 : }
7287 : }
7288 :
7289 48363 : enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7290 : /* If we have a condition reduction, see if we can simplify it further. */
7291 48363 : if (reduction_type == COND_REDUCTION)
7292 : {
7293 795 : if (SLP_TREE_LANES (slp_node) != 1)
7294 : return false;
7295 :
7296 : /* When the condition uses the reduction value in the condition, fail. */
7297 771 : if (SLP_TREE_REDUC_IDX (slp_node) == 0)
7298 : {
7299 0 : if (dump_enabled_p ())
7300 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7301 : "condition depends on previous iteration\n");
7302 0 : return false;
7303 : }
7304 :
7305 771 : if (reduc_chain_length == 1
7306 771 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7307 : OPTIMIZE_FOR_SPEED)
7308 748 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7309 : vectype_in,
7310 : OPTIMIZE_FOR_SPEED)))
7311 : {
7312 0 : if (dump_enabled_p ())
7313 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7314 : "optimizing condition reduction with"
7315 : " FOLD_EXTRACT_LAST.\n");
7316 0 : VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7317 : }
7318 771 : else if (cond_reduc_dt == vect_induction_def)
7319 : {
7320 109 : tree base
7321 : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7322 109 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7323 :
7324 109 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7325 : && TREE_CODE (step) == INTEGER_CST);
7326 109 : cond_reduc_val = NULL_TREE;
7327 109 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7328 109 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7329 109 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7330 : ;
7331 : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7332 : above base; punt if base is the minimum value of the type for
7333 : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7334 97 : else if (tree_int_cst_sgn (step) == -1)
7335 : {
7336 18 : cond_reduc_op_code = MIN_EXPR;
7337 18 : if (tree_int_cst_sgn (base) == -1)
7338 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7339 18 : else if (tree_int_cst_lt (base,
7340 18 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7341 18 : cond_reduc_val
7342 18 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7343 : }
7344 : else
7345 : {
7346 79 : cond_reduc_op_code = MAX_EXPR;
7347 79 : if (tree_int_cst_sgn (base) == 1)
7348 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7349 79 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7350 : base))
7351 79 : cond_reduc_val
7352 79 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7353 : }
7354 97 : if (cond_reduc_val)
7355 : {
7356 97 : if (dump_enabled_p ())
7357 61 : dump_printf_loc (MSG_NOTE, vect_location,
7358 : "condition expression based on "
7359 : "integer induction.\n");
7360 97 : VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
7361 97 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
7362 97 : = cond_reduc_val;
7363 97 : VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7364 : }
7365 : }
7366 662 : else if (cond_reduc_dt == vect_constant_def)
7367 : {
7368 85 : enum vect_def_type cond_initial_dt;
7369 85 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7370 85 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7371 85 : if (cond_initial_dt == vect_constant_def
7372 107 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7373 22 : TREE_TYPE (cond_reduc_val)))
7374 : {
7375 22 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7376 : cond_initial_val, cond_reduc_val);
7377 22 : if (e && (integer_onep (e) || integer_zerop (e)))
7378 : {
7379 22 : if (dump_enabled_p ())
7380 16 : dump_printf_loc (MSG_NOTE, vect_location,
7381 : "condition expression based on "
7382 : "compile time constant.\n");
7383 : /* Record reduction code at analysis stage. */
7384 22 : VECT_REDUC_INFO_CODE (reduc_info)
7385 22 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7386 22 : VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
7387 : }
7388 : }
7389 : }
7390 : }
7391 :
7392 48339 : if (STMT_VINFO_LIVE_P (phi_info))
7393 : return false;
7394 :
7395 48339 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
7396 :
7397 48339 : gcc_assert (ncopies >= 1);
7398 :
7399 48339 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7400 :
7401 : /* 4.2. Check support for the epilog operation.
7402 :
7403 : If STMT represents a reduction pattern, then the type of the
7404 : reduction variable may be different than the type of the rest
7405 : of the arguments. For example, consider the case of accumulation
7406 : of shorts into an int accumulator; The original code:
7407 : S1: int_a = (int) short_a;
7408 : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7409 :
7410 : was replaced with:
7411 : STMT: int_acc = widen_sum <short_a, int_acc>
7412 :
7413 : This means that:
7414 : 1. The tree-code that is used to create the vector operation in the
7415 : epilog code (that reduces the partial results) is not the
7416 : tree-code of STMT, but is rather the tree-code of the original
7417 : stmt from the pattern that STMT is replacing. I.e, in the example
7418 : above we want to use 'widen_sum' in the loop, but 'plus' in the
7419 : epilog.
7420 : 2. The type (mode) we use to check available target support
7421 : for the vector operation to be created in the *epilog*, is
7422 : determined by the type of the reduction variable (in the example
7423 : above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7424 : However the type (mode) we use to check available target support
7425 : for the vector operation to be created *inside the loop*, is
7426 : determined by the type of the other arguments to STMT (in the
7427 : example we'd check this: optab_handler (widen_sum_optab,
7428 : vect_short_mode)).
7429 :
7430 : This is contrary to "regular" reductions, in which the types of all
7431 : the arguments are the same as the type of the reduction variable.
7432 : For "regular" reductions we can therefore use the same vector type
7433 : (and also the same tree-code) when generating the epilog code and
7434 : when generating the code inside the loop. */
7435 :
7436 48339 : code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
7437 :
7438 : /* If conversion might have created a conditional operation like
7439 : IFN_COND_ADD already. Use the internal code for the following checks. */
7440 48339 : if (orig_code.is_internal_fn ())
7441 : {
7442 3682 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7443 3682 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7444 : }
7445 :
7446 48339 : VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
7447 :
7448 48339 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7449 48339 : if (reduction_type == TREE_CODE_REDUCTION)
7450 : {
7451 : /* Check whether it's ok to change the order of the computation.
7452 : Generally, when vectorizing a reduction we change the order of the
7453 : computation. This may change the behavior of the program in some
7454 : cases, so we need to check that this is ok. One exception is when
7455 : vectorizing an outer-loop: the inner-loop is executed sequentially,
7456 : and therefore vectorizing reductions in the inner-loop during
7457 : outer-loop vectorization is safe. Likewise when we are vectorizing
7458 : a series of reductions using SLP and the VF is one the reductions
7459 : are performed in scalar order. */
7460 47568 : if (!reduc_chain
7461 47568 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7462 : ;
7463 47427 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7464 : {
7465 : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7466 : is not directy used in stmt. */
7467 4799 : if (reduc_chain_length != 1)
7468 : {
7469 67 : if (dump_enabled_p ())
7470 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7471 : "in-order reduction chain without SLP.\n");
7472 67 : return false;
7473 : }
7474 : /* Code generation doesn't support function calls other
7475 : than .COND_*. */
7476 4732 : if (!op.code.is_tree_code ()
7477 4866 : && !(op.code.is_internal_fn ()
7478 67 : && conditional_internal_fn_code (internal_fn (op.code))
7479 : != ERROR_MARK))
7480 : {
7481 18 : if (dump_enabled_p ())
7482 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7483 : "in-order reduction chain operation not "
7484 : "supported.\n");
7485 18 : return false;
7486 : }
7487 4714 : VECT_REDUC_INFO_TYPE (reduc_info)
7488 4714 : = reduction_type = FOLD_LEFT_REDUCTION;
7489 : }
7490 42628 : else if (!commutative_binary_op_p (orig_code, op.type)
7491 42628 : || !associative_binary_op_p (orig_code, op.type))
7492 : {
7493 152 : if (dump_enabled_p ())
7494 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7495 : "reduction: not commutative/associative\n");
7496 152 : return false;
7497 : }
7498 : }
7499 :
7500 4714 : if ((reduction_type == COND_REDUCTION
7501 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7502 : || reduction_type == CONST_COND_REDUCTION
7503 43388 : || reduction_type == EXTRACT_LAST_REDUCTION)
7504 : && 1
7505 771 : && ncopies > 1)
7506 : {
7507 276 : if (dump_enabled_p ())
7508 60 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7509 : "multiple types in condition reduction.\n");
7510 276 : return false;
7511 : }
7512 :
7513 : /* See if we can convert a mask vector to a corresponding bool data vector
7514 : to perform the epilogue reduction. */
7515 47826 : tree alt_vectype_out = NULL_TREE;
7516 47826 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
7517 : {
7518 968 : alt_vectype_out
7519 1936 : = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
7520 968 : TREE_TYPE (vectype_out),
7521 : TYPE_VECTOR_SUBPARTS
7522 : (vectype_out));
7523 968 : if (!alt_vectype_out
7524 968 : || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
7525 1917 : TYPE_VECTOR_SUBPARTS (vectype_out))
7526 1936 : || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
7527 19 : alt_vectype_out = NULL_TREE;
7528 : }
7529 :
7530 47826 : internal_fn reduc_fn = IFN_LAST;
7531 47826 : if (reduction_type == TREE_CODE_REDUCTION
7532 47826 : || reduction_type == FOLD_LEFT_REDUCTION
7533 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7534 495 : || reduction_type == CONST_COND_REDUCTION)
7535 : {
7536 42728 : if (reduction_type == FOLD_LEFT_REDUCTION
7537 51369 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7538 42728 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7539 : {
7540 46766 : internal_fn sbool_fn = IFN_LAST;
7541 46766 : if (reduc_fn == IFN_LAST)
7542 : ;
7543 44848 : else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
7544 968 : || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
7545 : == MODE_VECTOR_BOOL))
7546 88728 : && direct_internal_fn_supported_p (reduc_fn, vectype_out,
7547 : OPTIMIZE_FOR_SPEED))
7548 : ;
7549 10205 : else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
7550 968 : && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
7551 11173 : && direct_internal_fn_supported_p (sbool_fn, vectype_out,
7552 : OPTIMIZE_FOR_SPEED))
7553 73 : reduc_fn = sbool_fn;
7554 10132 : else if (reduction_type != FOLD_LEFT_REDUCTION
7555 10132 : && alt_vectype_out
7556 10132 : && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
7557 : OPTIMIZE_FOR_SPEED))
7558 724 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7559 : else
7560 : {
7561 9408 : if (dump_enabled_p ())
7562 846 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7563 : "reduc op not supported by target.\n");
7564 :
7565 9408 : reduc_fn = IFN_LAST;
7566 : }
7567 : }
7568 : else
7569 : {
7570 676 : if (dump_enabled_p ())
7571 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7572 : "no reduc code for scalar code.\n");
7573 :
7574 676 : return false;
7575 : }
7576 46766 : if (reduc_fn == IFN_LAST
7577 46766 : && VECTOR_BOOLEAN_TYPE_P (vectype_out))
7578 : {
7579 171 : if (!alt_vectype_out)
7580 : {
7581 12 : if (dump_enabled_p ())
7582 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7583 : "cannot turn mask into bool data vector for "
7584 : "reduction epilogue.\n");
7585 12 : return false;
7586 : }
7587 159 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7588 : }
7589 : }
7590 384 : else if (reduction_type == COND_REDUCTION)
7591 : {
7592 384 : int scalar_precision
7593 384 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7594 384 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
7595 384 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7596 : vectype_out);
7597 :
7598 384 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7599 : OPTIMIZE_FOR_SPEED))
7600 12 : reduc_fn = IFN_REDUC_MAX;
7601 : }
7602 47138 : VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
7603 :
7604 47138 : if (reduction_type != EXTRACT_LAST_REDUCTION
7605 : && reduc_fn == IFN_LAST
7606 : && !nunits_out.is_constant ())
7607 : {
7608 : if (dump_enabled_p ())
7609 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7610 : "missing target support for reduction on"
7611 : " variable-length vectors.\n");
7612 : return false;
7613 : }
7614 :
7615 : /* For SLP reductions, see if there is a neutral value we can use. */
7616 47138 : tree neutral_op = NULL_TREE;
7617 47138 : tree initial_value = NULL_TREE;
7618 47138 : if (reduc_chain)
7619 1374 : initial_value = vect_phi_initial_value (reduc_def_phi);
7620 47138 : neutral_op = neutral_op_for_reduction (TREE_TYPE
7621 : (gimple_phi_result (reduc_def_phi)),
7622 : orig_code, initial_value);
7623 47138 : VECT_REDUC_INFO_NEUTRAL_OP (reduc_info) = neutral_op;
7624 :
7625 47138 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7626 : {
7627 : /* We can't support in-order reductions of code such as this:
7628 :
7629 : for (int i = 0; i < n1; ++i)
7630 : for (int j = 0; j < n2; ++j)
7631 : l += a[j];
7632 :
7633 : since GCC effectively transforms the loop when vectorizing:
7634 :
7635 : for (int i = 0; i < n1 / VF; ++i)
7636 : for (int j = 0; j < n2; ++j)
7637 : for (int k = 0; k < VF; ++k)
7638 : l += a[j];
7639 :
7640 : which is a reassociation of the original operation. */
7641 56 : if (dump_enabled_p ())
7642 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7643 : "in-order double reduction not supported.\n");
7644 :
7645 56 : return false;
7646 : }
7647 :
7648 47082 : if (reduction_type == FOLD_LEFT_REDUCTION
7649 3982 : && SLP_TREE_LANES (slp_node) > 1
7650 117 : && !reduc_chain)
7651 : {
7652 : /* We cannot use in-order reductions in this case because there is
7653 : an implicit reassociation of the operations involved. */
7654 55 : if (dump_enabled_p ())
7655 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7656 : "in-order unchained SLP reductions not supported.\n");
7657 55 : return false;
7658 : }
7659 :
7660 : /* For double reductions, and for SLP reductions with a neutral value,
7661 : we construct a variable-length initial vector by loading a vector
7662 : full of the neutral value and then shift-and-inserting the start
7663 : values into the low-numbered elements. This is however not needed
7664 : when neutral and initial value are equal or we can handle the
7665 : initial value via adjustment in the epilogue. */
7666 47027 : if ((double_reduc || neutral_op)
7667 : && !nunits_out.is_constant ()
7668 : && reduction_type != INTEGER_INDUC_COND_REDUCTION
7669 : && !((SLP_TREE_LANES (slp_node) == 1 || reduc_chain)
7670 : && neutral_op
7671 : && (!double_reduc
7672 : || operand_equal_p (neutral_op,
7673 : vect_phi_initial_value (reduc_def_phi))))
7674 : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7675 : vectype_out, OPTIMIZE_FOR_BOTH))
7676 : {
7677 : if (dump_enabled_p ())
7678 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7679 : "reduction on variable-length vectors requires"
7680 : " target support for a vector-shift-and-insert"
7681 : " operation.\n");
7682 : return false;
7683 : }
7684 :
7685 : /* Check extra constraints for variable-length unchained SLP reductions. */
7686 47027 : if (!reduc_chain
7687 : && !nunits_out.is_constant ())
7688 : {
7689 : /* We checked above that we could build the initial vector when
7690 : there's a neutral element value. Check here for the case in
7691 : which each SLP statement has its own initial value and in which
7692 : that value needs to be repeated for every instance of the
7693 : statement within the initial vector. */
7694 : unsigned int group_size = SLP_TREE_LANES (slp_node);
7695 : if (!neutral_op
7696 : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7697 : TREE_TYPE (vectype_out)))
7698 : {
7699 : if (dump_enabled_p ())
7700 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7701 : "unsupported form of SLP reduction for"
7702 : " variable-length vectors: cannot build"
7703 : " initial vector.\n");
7704 : return false;
7705 : }
7706 : /* The epilogue code relies on the number of elements being a multiple
7707 : of the group size. The duplicate-and-interleave approach to setting
7708 : up the initial vector does too. */
7709 : if (!multiple_p (nunits_out, group_size))
7710 : {
7711 : if (dump_enabled_p ())
7712 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7713 : "unsupported form of SLP reduction for"
7714 : " variable-length vectors: the vector size"
7715 : " is not a multiple of the number of results.\n");
7716 : return false;
7717 : }
7718 : }
7719 :
7720 47027 : if (reduction_type == COND_REDUCTION)
7721 : {
7722 384 : widest_int ni;
7723 :
7724 384 : if (! max_loop_iterations (loop, &ni))
7725 : {
7726 14 : if (dump_enabled_p ())
7727 0 : dump_printf_loc (MSG_NOTE, vect_location,
7728 : "loop count not known, cannot create cond "
7729 : "reduction.\n");
7730 14 : return false;
7731 : }
7732 : /* Convert backedges to iterations. */
7733 370 : ni += 1;
7734 :
7735 : /* The additional index will be the same type as the condition. Check
7736 : that the loop can fit into this less one (because we'll use up the
7737 : zero slot for when there are no matches). */
7738 370 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7739 370 : if (wi::geu_p (ni, wi::to_widest (max_index)))
7740 : {
7741 90 : if (dump_enabled_p ())
7742 54 : dump_printf_loc (MSG_NOTE, vect_location,
7743 : "loop size is greater than data size.\n");
7744 90 : return false;
7745 : }
7746 384 : }
7747 :
7748 : /* In case the vectorization factor (VF) is bigger than the number
7749 : of elements that we can fit in a vectype (nunits), we have to generate
7750 : more than one vector stmt - i.e - we need to "unroll" the
7751 : vector stmt by a factor VF/nunits. For more details see documentation
7752 : in vectorizable_operation. */
7753 :
7754 : /* If the reduction is used in an outer loop we need to generate
7755 : VF intermediate results, like so (e.g. for ncopies=2):
7756 : r0 = phi (init, r0)
7757 : r1 = phi (init, r1)
7758 : r0 = x0 + r0;
7759 : r1 = x1 + r1;
7760 : (i.e. we generate VF results in 2 registers).
7761 : In this case we have a separate def-use cycle for each copy, and therefore
7762 : for each copy we get the vector def for the reduction variable from the
7763 : respective phi node created for this copy.
7764 :
7765 : Otherwise (the reduction is unused in the loop nest), we can combine
7766 : together intermediate results, like so (e.g. for ncopies=2):
7767 : r = phi (init, r)
7768 : r = x0 + r;
7769 : r = x1 + r;
7770 : (i.e. we generate VF/2 results in a single register).
7771 : In this case for each copy we get the vector def for the reduction variable
7772 : from the vectorized reduction operation generated in the previous iteration.
7773 :
7774 : This only works when we see both the reduction PHI and its only consumer
7775 : in vectorizable_reduction and there are no intermediate stmts
7776 : participating. When unrolling we want each unrolled iteration to have its
7777 : own reduction accumulator since one of the main goals of unrolling a
7778 : reduction is to reduce the aggregate loop-carried latency. */
7779 46923 : if (ncopies > 1
7780 46923 : && !reduc_chain
7781 5252 : && SLP_TREE_LANES (slp_node) == 1
7782 5092 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7783 5073 : && reduc_chain_length == 1
7784 4770 : && loop_vinfo->suggested_unroll_factor == 1)
7785 46923 : single_defuse_cycle = true;
7786 :
7787 46923 : if (single_defuse_cycle && !lane_reducing)
7788 : {
7789 4203 : gcc_assert (op.code != COND_EXPR);
7790 :
7791 : /* 4. check support for the operation in the loop
7792 :
7793 : This isn't necessary for the lane reduction codes, since they
7794 : can only be produced by pattern matching, and it's up to the
7795 : pattern matcher to test for support. The main reason for
7796 : specifically skipping this step is to avoid rechecking whether
7797 : mixed-sign dot-products can be implemented using signed
7798 : dot-products. */
7799 4203 : machine_mode vec_mode = TYPE_MODE (vectype_in);
7800 4203 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
7801 : {
7802 711 : if (dump_enabled_p ())
7803 24 : dump_printf (MSG_NOTE, "op not supported by target.\n");
7804 1422 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7805 711 : || !vect_can_vectorize_without_simd_p (op.code))
7806 : single_defuse_cycle = false;
7807 : else
7808 5 : if (dump_enabled_p ())
7809 0 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7810 : }
7811 :
7812 4203 : if (vect_emulated_vector_p (vectype_in)
7813 4203 : && !vect_can_vectorize_without_simd_p (op.code))
7814 : {
7815 0 : if (dump_enabled_p ())
7816 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
7817 0 : return false;
7818 : }
7819 : }
7820 46923 : if (dump_enabled_p () && single_defuse_cycle)
7821 650 : dump_printf_loc (MSG_NOTE, vect_location,
7822 : "using single def-use cycle for reduction by reducing "
7823 : "multiple vectors to one in the loop body\n");
7824 46923 : VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7825 :
7826 : /* For lane-reducing operation, the below processing related to single
7827 : defuse-cycle will be done in its own vectorizable function. One more
7828 : thing to note is that the operation must not be involved in fold-left
7829 : reduction. */
7830 46923 : single_defuse_cycle &= !lane_reducing;
7831 :
7832 46923 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
7833 23976 : for (i = 0; i < (int) op.num_ops; i++)
7834 16612 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7835 : {
7836 0 : if (dump_enabled_p ())
7837 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7838 : "incompatible vector types for invariants\n");
7839 0 : return false;
7840 : }
7841 :
7842 46923 : vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
7843 : reduction_type, ncopies, cost_vec);
7844 : /* Cost the reduction op inside the loop if transformed via
7845 : vect_transform_reduction for non-lane-reducing operation. Otherwise
7846 : this is costed by the separate vectorizable_* routines. */
7847 46923 : if (single_defuse_cycle)
7848 3497 : record_stmt_cost (cost_vec, ncopies, vector_stmt,
7849 : slp_for_stmt_info, 0, vect_body);
7850 :
7851 46923 : if (dump_enabled_p ()
7852 46923 : && reduction_type == FOLD_LEFT_REDUCTION)
7853 219 : dump_printf_loc (MSG_NOTE, vect_location,
7854 : "using an in-order (fold-left) reduction.\n");
7855 46923 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7856 :
7857 : /* All but single defuse-cycle optimized and fold-left reductions go
7858 : through their own vectorizable_* routines. */
7859 46923 : stmt_vec_info tem
7860 46923 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (slp_node_instance));
7861 46923 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
7862 39559 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7863 : else
7864 : {
7865 7364 : STMT_VINFO_DEF_TYPE (tem) = vect_reduction_def;
7866 7364 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7867 3166 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7868 : slp_node, op.code, op.type,
7869 : vectype_in);
7870 : }
7871 : return true;
7872 : }
7873 :
7874 : /* STMT_INFO is a dot-product reduction whose multiplication operands
7875 : have different signs. Emit a sequence to emulate the operation
7876 : using a series of signed DOT_PROD_EXPRs and return the last
7877 : statement generated. VEC_DEST is the result of the vector operation
7878 : and VOP lists its inputs. */
7879 :
7880 : static gassign *
7881 4 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7882 : gimple_stmt_iterator *gsi, tree vec_dest,
7883 : tree vop[3])
7884 : {
7885 4 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7886 4 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7887 4 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
7888 4 : gimple *new_stmt;
7889 :
7890 : /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
7891 4 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7892 0 : std::swap (vop[0], vop[1]);
7893 :
7894 : /* Convert any unsigned inputs to signed types. */
7895 16 : for (int i = 0; i < 3; ++i)
7896 12 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7897 : {
7898 4 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7899 4 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7900 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7901 4 : vop[i] = tmp;
7902 : }
7903 :
7904 : /* In the comments below we assume 8-bit inputs for simplicity,
7905 : but the approach works for any full integer type. */
7906 :
7907 : /* Create a vector of -128 (the minimum of the narrow element type). */
7908 4 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7909 4 : tree min_narrow = build_vector_from_val (narrow_vectype,
7910 : min_narrow_elttype);
7911 :
7912 : /* Create a vector of 64, obtained as a logical right shift by one
7913 : of the type minimum (0x80 >> 1 == 0x40 for 8-bit inputs). */
7913 4 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7914 4 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7915 4 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7916 :
7917 : /* Emit: SUB_RES = VOP[0] - 128, expressed as addition of the
7918 : vector of -128 built above. */
7918 4 : tree sub_res = make_ssa_name (narrow_vectype);
7919 4 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7920 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7921 :
7922 : /* Emit:
7923 :
7924 : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7925 : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7926 : STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
7927 :
7928 : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7929 : Doing the two 64 * y steps first allows more time to compute x. */
7930 4 : tree stage1 = make_ssa_name (wide_vectype);
7931 4 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7932 : vop[1], half_narrow, vop[2]);
7933 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7934 :
7935 4 : tree stage2 = make_ssa_name (wide_vectype);
7936 4 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7937 : vop[1], half_narrow, stage1);
7938 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7939 :
7940 4 : tree stage3 = make_ssa_name (wide_vectype);
7941 4 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7942 : sub_res, vop[1], stage2);
7943 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7944 :
7945 : /* Convert STAGE3 to the reduction type. */
7946 4 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7947 4 : }
7948 :
7949 : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7950 : value. Emits the vectorized statements for one reduction operation
7951 : into the loop body at GSI and records the vector defs on SLP_NODE.
7952 : Returns true on success. */
7951 :
7952 : bool
7953 2575 : vect_transform_reduction (loop_vec_info loop_vinfo,
7954 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7955 : slp_tree slp_node)
7956 : {
7957 2575 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
7958 2575 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7959 2575 : unsigned vec_num;
7960 :
7961 2575 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
7962 :
7963 2575 : if (nested_in_vect_loop_p (loop, stmt_info))
7964 : {
7965 0 : loop = loop->inner;
7966 0 : gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
7967 : == vect_double_reduction_def)
7967 0 : ;
7968 : }
7969 :
7970 2575 : gimple_match_op op;
7971 2575 : if (!gimple_extract_op (stmt_info->stmt, &op))
7972 0 : gcc_unreachable ();
7973 :
7974 : /* All uses but the last are expected to be defined in the loop.
7975 : The last use is the reduction variable. In case of nested cycle this
7976 : assumption is not true: we use reduc_index to record the index of the
7977 : reduction variable. */
7978 2575 : int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
7979 2575 : tree vectype_in = SLP_TREE_VECTYPE (slp_node);
7980 2575 : if (lane_reducing_op_p (op.code))
7981 252 : vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);
7982 :
7983 2575 : vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
7984 :
7985 2575 : code_helper code = canonicalize_code (op.code, op.type);
7986 2575 : internal_fn cond_fn
7987 476 : = ((code.is_internal_fn ()
7988 476 : && internal_fn_mask_index ((internal_fn)code) != -1)
7989 2575 : ? (internal_fn)code : get_conditional_internal_fn (code, op.type));
7990 :
7991 2575 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7992 2575 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7993 2575 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7994 :
7995 : /* Transform. */
7996 2575 : tree new_temp = NULL_TREE;
7997 18025 : auto_vec<tree> vec_oprnds[3];
7998 :
7999 2575 : if (dump_enabled_p ())
8000 745 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8001 :
8002 : /* A binary COND_OP reduction must have the same definition and else
8003 : value. */
8004 3051 : bool cond_fn_p = code.is_internal_fn ()
8005 476 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8006 476 : if (cond_fn_p)
8007 : {
8008 476 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8009 : || code == IFN_COND_MUL || code == IFN_COND_AND
8010 : || code == IFN_COND_IOR || code == IFN_COND_XOR
8011 : || code == IFN_COND_MIN || code == IFN_COND_MAX);
8012 476 : gcc_assert (op.num_ops == 4
8013 : && (op.ops[reduc_index]
8014 : == op.ops[internal_fn_else_index ((internal_fn) code)]));
8015 : }
8016 :
8017 2575 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8018 :
8019 2575 : vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
8020 2575 : if (reduction_type == FOLD_LEFT_REDUCTION)
8021 : {
8022 843 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
8023 843 : gcc_assert (code.is_tree_code () || cond_fn_p);
8024 843 : return vectorize_fold_left_reduction
8025 843 : (loop_vinfo, stmt_info, gsi, slp_node,
8026 843 : code, reduc_fn, op.num_ops, vectype_in,
8027 843 : reduc_index, masks, lens);
8028 : }
8029 :
8030 1732 : bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
8031 1732 : bool lane_reducing = lane_reducing_op_p (code);
8032 1480 : gcc_assert (single_defuse_cycle || lane_reducing);
8033 :
8034 1732 : if (lane_reducing)
8035 : {
8036 : /* The last operand of lane-reducing op is for reduction. */
8037 252 : gcc_assert (reduc_index == (int) op.num_ops - 1);
8038 : }
8039 :
8040 : /* Create the destination vector */
8041 1732 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8042 1732 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8043 :
8044 : /* Get NCOPIES vector definitions for all operands except the reduction
8045 : definition. */
8046 1732 : if (!cond_fn_p)
8047 : {
8048 1279 : gcc_assert (reduc_index >= 0 && reduc_index <= 2);
8049 2109 : vect_get_vec_defs (loop_vinfo, slp_node,
8050 1279 : single_defuse_cycle && reduc_index == 0
8051 : ? NULL_TREE : op.ops[0], &vec_oprnds[0],
8052 1279 : single_defuse_cycle && reduc_index == 1
8053 : ? NULL_TREE : op.ops[1], &vec_oprnds[1],
8054 1279 : op.num_ops == 3
8055 252 : && !(single_defuse_cycle && reduc_index == 2)
8056 : ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
8057 : }
8058 : else
8059 : {
8060 : /* For a conditional operation pass the truth type as mask
8061 : vectype. */
8062 453 : gcc_assert (single_defuse_cycle
8063 : && (reduc_index == 1 || reduc_index == 2));
8064 453 : vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
8065 : &vec_oprnds[0],
8066 : reduc_index == 1 ? NULL_TREE : op.ops[1],
8067 : &vec_oprnds[1],
8068 : reduc_index == 2 ? NULL_TREE : op.ops[2],
8069 : &vec_oprnds[2]);
8070 : }
8071 :
8072 : /* For single def-use cycles get one copy of the vectorized reduction
8073 : definition. */
8074 1732 : if (single_defuse_cycle)
8075 : {
8076 1647 : vect_get_vec_defs (loop_vinfo, slp_node,
8077 : reduc_index == 0 ? op.ops[0] : NULL_TREE,
8078 : &vec_oprnds[0],
8079 : reduc_index == 1 ? op.ops[1] : NULL_TREE,
8080 : &vec_oprnds[1],
8081 : reduc_index == 2 ? op.ops[2] : NULL_TREE,
8082 : &vec_oprnds[2]);
8083 : }
8084 85 : else if (lane_reducing)
8085 : {
8086 : /* For normal reduction, consistency between vectorized def/use is
8087 : naturally ensured when mapping from scalar statement. But if lane-
8088 : reducing op is involved in reduction, things would become somewhat
8089 : complicated in that the op's result and operand for accumulation are
8090 : limited to fewer lanes than other operands, which certainly causes
8091 : def/use mismatch on adjacent statements around the op if we do not
8092 : have any kind of specific adjustment. One approach is to refit lane-
8093 : reducing op in the way of introducing new trivial pass-through copies
8094 : to fix possible def/use gap, so as to make it behave like a normal op.
8095 : And vector reduction PHIs are always generated to the full extent, no
8096 : matter lane-reducing op exists or not. If some copies or PHIs are
8097 : actually superfluous, they would be cleaned up by passes after
8098 : vectorization. An example for single-lane slp, lane-reducing ops
8099 : with mixed input vectypes in a reduction chain, is given as below.
8100 : Similarly, this handling is applicable for multiple-lane slp as well.
8101 :
8102 : int sum = 1;
8103 : for (i)
8104 : {
8105 : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
8106 : sum += w[i]; // widen-sum <vector(16) char>
8107 : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
8108 : sum += n[i]; // normal <vector(4) int>
8109 : }
8110 :
8111 : The vector size is 128-bit, vectorization factor is 16. Reduction
8112 : statements would be transformed as:
8113 :
8114 : vector<4> int sum_v0 = { 0, 0, 0, 1 };
8115 : vector<4> int sum_v1 = { 0, 0, 0, 0 };
8116 : vector<4> int sum_v2 = { 0, 0, 0, 0 };
8117 : vector<4> int sum_v3 = { 0, 0, 0, 0 };
8118 :
8119 : for (i / 16)
8120 : {
8121 : sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8122 : sum_v1 = sum_v1; // copy
8123 : sum_v2 = sum_v2; // copy
8124 : sum_v3 = sum_v3; // copy
8125 :
8126 : sum_v0 = sum_v0; // copy
8127 : sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8128 : sum_v2 = sum_v2; // copy
8129 : sum_v3 = sum_v3; // copy
8130 :
8131 : sum_v0 = sum_v0; // copy
8132 : sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8133 : sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8134 : sum_v3 = sum_v3; // copy
8135 :
8136 : sum_v0 += n_v0[i: 0 ~ 3 ];
8137 : sum_v1 += n_v1[i: 4 ~ 7 ];
8138 : sum_v2 += n_v2[i: 8 ~ 11];
8139 : sum_v3 += n_v3[i: 12 ~ 15];
8140 : }
8141 :
8142 : Moreover, for a higher instruction parallelism in final vectorized
8143 : loop, it is considered to make those effective vector lane-reducing
8144 : ops be distributed evenly among all def-use cycles. In the above
8145 : example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
8146 : cycles, instruction dependency among them could be eliminated. */
8147 85 : unsigned effec_ncopies = vec_oprnds[0].length ();
8148 85 : unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8149 :
8150 85 : gcc_assert (effec_ncopies <= total_ncopies);
8151 :
8152 85 : if (effec_ncopies < total_ncopies)
8153 : {
8154 255 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8155 : {
8156 340 : gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8157 170 : vec_oprnds[i].safe_grow_cleared (total_ncopies);
8158 : }
8159 : }
8160 :
8161 85 : tree reduc_vectype_in = vectype_in;
8162 85 : gcc_assert (reduc_vectype_in);
8163 :
8164 85 : unsigned effec_reduc_ncopies
8165 85 : = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
8166 :
8167 85 : gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8168 :
8169 85 : if (effec_ncopies < effec_reduc_ncopies)
8170 : {
8171 : /* Find suitable def-use cycles to generate vectorized statements
8172 : into, and reorder operands based on the selection. */
8173 0 : unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
8174 0 : unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8175 :
8176 0 : gcc_assert (curr_pos < effec_reduc_ncopies);
8177 0 : VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;
8178 :
8179 0 : if (curr_pos)
8180 : {
8181 0 : unsigned count = effec_reduc_ncopies - effec_ncopies;
8182 0 : unsigned start = curr_pos - count;
8183 :
8184 0 : if ((int) start < 0)
8185 : {
8186 0 : count = curr_pos;
8187 0 : start = 0;
8188 : }
8189 :
8190 0 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8191 : {
8192 0 : for (unsigned j = effec_ncopies; j > start; j--)
8193 : {
8194 0 : unsigned k = j - 1;
8195 0 : std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8196 0 : gcc_assert (!vec_oprnds[i][k]);
8197 : }
8198 : }
8199 : }
8200 : }
8201 : }
8202 :
8203 1732 : bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
8204 2971 : unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8205 1732 : unsigned mask_index = 0;
8206 :
8207 : /* Emit one vectorized statement per copy of the non-reduction
8208 : operands. */
8207 7593 : for (unsigned i = 0; i < num; ++i)
8208 : {
8209 5861 : gimple *new_stmt;
8210 5861 : tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8211 5861 : if (!vop[0] || !vop[1])
8212 : {
8213 456 : tree reduc_vop = vec_oprnds[reduc_index][i];
8214 :
8215 : /* If we could not generate an effective vector statement for current
8216 : portion of reduction operand, insert a trivial copy to simply
8217 : hand over the operand to other dependent statements. */
8218 456 : gcc_assert (reduc_vop);
8219 :
8220 456 : if (TREE_CODE (reduc_vop) == SSA_NAME
8221 456 : && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8222 456 : new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8223 : else
8224 : {
8225 0 : new_temp = make_ssa_name (vec_dest);
8226 0 : new_stmt = gimple_build_assign (new_temp, reduc_vop);
8227 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8228 : gsi);
8229 : }
8230 : }
8231 5405 : else if (masked_loop_p && !mask_by_cond_expr)
8232 : {
8233 : /* No conditional ifns have been defined for lane-reducing op
8234 : yet. */
8235 16 : gcc_assert (!lane_reducing);
8236 :
8237 16 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8238 : vec_num, vectype_in,
8239 : mask_index++);
8240 16 : gcall *call;
8241 24 : if (code.is_internal_fn () && cond_fn_p)
8242 : {
8243 16 : gcc_assert (op.num_ops >= 3
8244 : && internal_fn_mask_index (internal_fn (code)) == 0);
8245 8 : vop[2] = vec_oprnds[2][i];
8246 8 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
8247 : mask, vop[0], gsi);
8248 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
8249 : vop[2], vop[reduc_index]);
8250 : }
8251 : else
8252 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
8253 : vop[1], vop[reduc_index]);
8254 16 : new_temp = make_ssa_name (vec_dest, call);
8255 16 : gimple_call_set_lhs (call, new_temp);
8256 16 : gimple_call_set_nothrow (call, true);
8257 16 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8258 16 : new_stmt = call;
8259 : }
8260 : else
8261 : {
8262 5389 : if (op.num_ops >= 3)
8263 1747 : vop[2] = vec_oprnds[2][i];
8264 :
8265 5389 : if (masked_loop_p && mask_by_cond_expr)
8266 : {
8267 4 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8268 : vec_num, vectype_in,
8269 : mask_index++);
8270 4 : build_vect_cond_expr (code, vop, mask, gsi);
8271 : }
8272 :
8273 5389 : if (emulated_mixed_dot_prod)
8274 4 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8275 : vec_dest, vop);
8276 :
8277 6727 : else if (code.is_internal_fn () && !cond_fn_p)
8278 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8279 : op.num_ops,
8280 : vop[0], vop[1], vop[2]);
8281 6727 : else if (code.is_internal_fn () && cond_fn_p)
8282 1342 : new_stmt = gimple_build_call_internal (internal_fn (code),
8283 : op.num_ops,
8284 : vop[0], vop[1], vop[2],
8285 : vop[reduc_index]);
8286 : else
8287 4043 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8288 : vop[0], vop[1], vop[2]);
8289 5389 : new_temp = make_ssa_name (vec_dest, new_stmt);
8290 5389 : gimple_set_lhs (new_stmt, new_temp);
8291 5389 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8292 : }
8293 :
8294 : /* In a single def-use cycle the result of this copy feeds the
8295 : reduction operand of the next copy; otherwise record it as a
8296 : vector def of the SLP node. */
8294 5861 : if (single_defuse_cycle && i < num - 1)
8295 3530 : vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8296 : else
8297 2331 : slp_node->push_vec_def (new_stmt);
8298 : }
8299 :
8300 : return true;
8301 10300 : }
8302 :
8303 : /* Transform phase of a cycle PHI: create the vector reduction PHI
8304 : node(s) in the loop header and set up their loop-entry arguments.
8305 : The loop-latch arguments are filled in later during epilogue
8306 : processing. */
8304 :
8305 : bool
8306 23709 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8307 : stmt_vec_info stmt_info,
8308 : slp_tree slp_node, slp_instance slp_node_instance)
8309 : {
8310 23709 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
8311 23709 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8312 23709 : int i;
8313 23709 : bool nested_cycle = false;
8314 23709 : int vec_num;
8315 :
8316 23843 : if (nested_in_vect_loop_p (loop, stmt_info))
8317 : {
8318 : loop = loop->inner;
8319 : nested_cycle = true;
8320 : }
8321 :
8322 23709 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
8323 23709 : if (reduc_info
8324 23067 : && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8325 23067 : || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
8326 : /* Leave the scalar phi in place. */
8327 : return true;
8328 :
8329 22224 : if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ())
8330 118 : dump_printf_loc (MSG_NOTE, vect_location,
8331 : "vectorizing a reduction chain\n");
8332 :
8333 22866 : vec_num = vect_get_num_copies (loop_vinfo, slp_node);
8334 :
8335 : /* Check whether we should use a single PHI node and accumulate
8336 : vectors to one before the backedge. */
8337 22866 : if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
8338 22866 : vec_num = 1;
8339 :
8340 : /* Create the destination vector */
8341 22866 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8342 22866 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8343 : vectype_out);
8344 :
8345 : /* Get the loop-entry arguments. */
8346 22866 : auto_vec<tree> vec_initial_defs;
8347 22866 : vec_initial_defs.reserve (vec_num);
8348 : /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8349 : and we can't use zero for induc_val, use initial_def. Similarly
8350 : for REDUC_MIN and initial_def larger than the base. */
8351 22866 : if (reduc_info
8352 22224 : && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8353 : {
8354 62 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8355 62 : tree initial_def = vect_phi_initial_value (phi);
8356 62 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
8357 62 : tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
8358 62 : if (TREE_CODE (initial_def) == INTEGER_CST
8359 60 : && !integer_zerop (induc_val)
8360 122 : && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
8361 42 : && tree_int_cst_lt (initial_def, induc_val))
8362 58 : || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
8363 18 : && tree_int_cst_lt (induc_val, initial_def))))
8364 : {
8365 2 : induc_val = initial_def;
8366 : /* Communicate we used the initial_def to epilogue
8367 : generation. */
8368 2 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8369 : }
8370 62 : vec_initial_defs.quick_push
8371 62 : (build_vector_from_val (vectype_out, induc_val));
8372 62 : }
8373 22804 : else if (nested_cycle)
8374 : {
8375 726 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8376 726 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8377 : &vec_initial_defs);
8378 : }
8379 : else
8380 : {
8381 22078 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8382 22078 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
8383 22078 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8384 :
8385 22078 : unsigned int num_phis = stmts.length ();
8386 22078 : if (reduc_info->is_reduc_chain)
8387 188 : num_phis = 1;
8388 22078 : initial_values.reserve (num_phis);
8389 44612 : for (unsigned int i = 0; i < num_phis; ++i)
8390 : {
8391 22534 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8392 22534 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8393 : }
8394 22078 : tree neutral_op = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
8395 22078 : if (vec_num == 1
8396 22078 : && vect_find_reusable_accumulator (loop_vinfo,
8397 : reduc_info, vectype_out))
8398 : ;
8399 : /* Try to simplify the vector initialization by applying an
8400 : adjustment after the reduction has been performed. This
8401 : can also break a critical path but on the other hand
8402 : requires keeping the initial value live across the loop. */
8403 17894 : else if (neutral_op
8404 17332 : && initial_values.length () == 1
8405 17147 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8406 34966 : && !operand_equal_p (neutral_op, initial_values[0]))
8407 : {
8408 12227 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
8409 12227 : = initial_values[0];
8410 12227 : initial_values[0] = neutral_op;
8411 : }
8412 22078 : if (!VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
8413 4184 : || loop_vinfo->main_loop_edge)
8414 43710 : get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
8415 : &vec_initial_defs, vec_num,
8416 : stmts.length (), neutral_op);
8417 : }
8418 :
8419 22866 : if (reduc_info)
8420 22224 : if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
8421 : {
8422 4184 : tree def = accumulator->reduc_input;
8423 4184 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8424 : {
8425 4181 : unsigned int nreduc;
8426 8362 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8427 4181 : (TREE_TYPE (def)),
8428 4181 : TYPE_VECTOR_SUBPARTS (vectype_out),
8429 : &nreduc);
8430 0 : gcc_assert (res);
8431 4181 : gimple_seq stmts = NULL;
8432 : /* Reduce the single vector to a smaller one. */
8433 4181 : if (nreduc != 1)
8434 : {
8435 : /* Perform the reduction in the appropriate type. */
8436 4181 : tree rvectype = vectype_out;
8437 4181 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8438 4181 : TREE_TYPE (TREE_TYPE (def))))
8439 235 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8440 : TYPE_VECTOR_SUBPARTS
8441 470 : (vectype_out));
8442 4181 : def = vect_create_partial_epilog (def, rvectype,
8443 : VECT_REDUC_INFO_CODE
8444 : (reduc_info),
8445 : &stmts);
8446 : }
8447 : /* The epilogue loop might use a different vector mode, like
8448 : VNx2DI vs. V2DI. */
8449 4181 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8450 : {
8451 0 : tree reduc_type = build_vector_type_for_mode
8452 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8453 0 : def = gimple_convert (&stmts, reduc_type, def);
8454 : }
8455 : /* Adjust the input so we pick up the partially reduced value
8456 : for the skip edge in vect_create_epilog_for_reduction. */
8457 4181 : accumulator->reduc_input = def;
8458 : /* And the reduction could be carried out using a different sign. */
8459 4181 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8460 235 : def = gimple_convert (&stmts, vectype_out, def);
8461 4181 : edge e;
8462 4181 : if ((e = loop_vinfo->main_loop_edge)
8463 4181 : || (e = loop_vinfo->skip_this_loop_edge))
8464 : {
8465 : /* While we'd like to insert on the edge this will split
8466 : blocks and disturb bookkeeping, we also will eventually
8467 : need this on the skip edge. Rely on sinking to
8468 : fixup optimal placement and insert in the pred. */
8469 3958 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
8470 : /* Insert before a cond that eventually skips the
8471 : epilogue. */
8472 3958 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8473 3941 : gsi_prev (&gsi);
8474 3958 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8475 : }
8476 : else
8477 223 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8478 : stmts);
8479 : }
8480 4184 : if (loop_vinfo->main_loop_edge)
8481 3961 : vec_initial_defs[0]
8482 3961 : = vect_get_main_loop_result (loop_vinfo, def,
8483 3961 : vec_initial_defs[0]);
8484 : else
8485 223 : vec_initial_defs.safe_push (def);
8486 : }
8487 :
8488 : /* Generate the reduction PHIs upfront. */
8489 47512 : for (i = 0; i < vec_num; i++)
8490 : {
8491 24646 : tree vec_init_def = vec_initial_defs[i];
8492 : /* Create the reduction-phi that defines the reduction
8493 : operand. */
8494 24646 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
8495 24646 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8496 : UNKNOWN_LOCATION);
8497 :
8498 : /* The loop-latch arg is set in epilogue processing. */
8499 :
8500 24646 : slp_node->push_vec_def (new_phi);
8501 : }
8502 :
8503 22866 : return true;
8504 22866 : }
8505 :
8506 : /* Analysis phase for loop-closed (LC) PHIs: single-argument PHI nodes
8507 : in LOOP_VINFO. Checks the vectype of the single child is usable and
8508 : compatible and, on success, marks SLP_NODE for the LC-PHI transform
8509 : (vect_transform_lc_phi). Returns false if this stmt is not an LC PHI
8510 : or cannot be vectorized as one. */
8507 :
8508 : bool
8509 173884 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8510 : stmt_vec_info stmt_info,
8511 : slp_tree slp_node)
8512 : {
8513 173884 : if (!loop_vinfo
8514 173884 : || !is_a <gphi *> (stmt_info->stmt)
8515 208615 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8516 : return false;
8517 :
8518 761 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8519 0 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8520 : return false;
8521 :
8522 : /* Deal with copies from externs or constants that disguise as
8523 : loop-closed PHI nodes (PR97886). */
8524 761 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8525 : SLP_TREE_VECTYPE (slp_node)))
8526 : {
8527 0 : if (dump_enabled_p ())
8528 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8529 : "incompatible vector types for invariants\n");
8530 0 : return false;
8531 : }
8532 :
8533 : /* ??? This can happen with data vs. mask uses of boolean. */
8534 761 : if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
8535 761 : SLP_TREE_VECTYPE
8536 : (SLP_TREE_CHILDREN (slp_node)[0])))
8537 : {
8538 0 : if (dump_enabled_p ())
8539 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8540 : "missed mask promotion\n");
8541 0 : return false;
8542 : }
8543 :
8544 761 : SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
8545 761 : return true;
8546 : }
8547 :
8548 : /* Transform phase of an LC PHI: emit one vector PHI per vector def of
8549 : the single argument, each fed from the single predecessor edge, and
8550 : record the new PHIs as the vector defs of SLP_NODE. */
8548 : bool
8549 504 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
8550 : stmt_vec_info stmt_info,
8551 : slp_tree slp_node)
8552 : {
8553 :
8554 504 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8555 504 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8556 504 : basic_block bb = gimple_bb (stmt_info->stmt);
8557 504 : edge e = single_pred_edge (bb);
8558 504 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8559 504 : auto_vec<tree> vec_oprnds;
8560 1008 : vect_get_vec_defs (loop_vinfo, slp_node,
8561 504 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8562 1118 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8563 : {
8564 : /* Create the vectorized LC PHI node. */
8565 614 : gphi *new_phi = create_phi_node (vec_dest, bb);
8566 614 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8567 614 : slp_node->push_vec_def (new_phi);
8568 : }
8569 :
8570 504 : return true;
8571 504 : }
8572 :
8573 : /* Vectorizes PHIs. With COST_VEC non-NULL performs the analysis/costing
8574 : phase only; otherwise emits the vector PHI nodes, wiring up arguments
8575 : for all children that already have vector defs (remaining arguments
8576 : are filled in by the SLP scheduler). */
8574 :
8575 : bool
8576 138530 : vectorizable_phi (bb_vec_info vinfo,
8577 : stmt_vec_info stmt_info,
8578 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8579 : {
8580 138530 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8581 : return false;
8582 :
8583 71130 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8584 : return false;
8585 :
8586 71130 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8587 :
8588 71130 : if (cost_vec) /* transformation not required. */
8589 : {
8590 : slp_tree child;
8591 : unsigned i;
8592 194269 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8593 137290 : if (!child)
8594 : {
8595 0 : if (dump_enabled_p ())
8596 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8597 : "PHI node with unvectorized backedge def\n");
8598 0 : return false;
8599 : }
8600 137290 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8601 : {
8602 18 : if (dump_enabled_p ())
8603 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8604 : "incompatible vector types for invariants\n");
8605 18 : return false;
8606 : }
8607 137272 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8608 137272 : && !useless_type_conversion_p (vectype,
8609 : SLP_TREE_VECTYPE (child)))
8610 : {
8611 : /* With bools we can have mask and non-mask precision vectors
8612 : or different non-mask precisions. While pattern recog is
8613 : supposed to guarantee consistency here bugs in it can cause
8614 : mismatches (PR103489 and PR103800 for example).
8615 : Deal with them here instead of ICEing later. */
8616 18 : if (dump_enabled_p ())
8617 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8618 : "incompatible vector type setup from "
8619 : "bool pattern detection\n");
8620 18 : return false;
8621 : }
8622 :
8623 : /* For single-argument PHIs assume coalescing which means zero cost
8624 : for the scalar and the vector PHIs. This avoids artificially
8625 : favoring the vector path (but may pessimize it in some cases). */
8626 56979 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8627 51676 : record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
8628 : vector_stmt, slp_node, vectype, 0, vect_body);
8629 56979 : SLP_TREE_TYPE (slp_node) = phi_info_type;
8630 56979 : return true;
8631 : }
8632 :
8633 14115 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8634 14115 : basic_block bb = gimple_bb (stmt_info->stmt);
8635 14115 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8636 14115 : auto_vec<gphi *> new_phis;
8637 51149 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8638 : {
8639 37034 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8640 :
8641 : /* Skip not yet vectorized defs. */
8642 37481 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8643 37034 : && SLP_TREE_VEC_DEFS (child).is_empty ())
8644 447 : continue;
8645 :
8646 36587 : auto_vec<tree> vec_oprnds;
8647 36587 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8648 36587 : if (!new_phis.exists ())
8649 : {
8650 14115 : new_phis.create (vec_oprnds.length ());
8651 29843 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8652 : {
8653 : /* Create the vectorized PHI node. */
8654 15728 : new_phis.quick_push (create_phi_node (vec_dest, bb));
8655 15728 : slp_node->push_vec_def (new_phis[j]);
8656 : }
8657 : }
8658 36587 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8659 79858 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8660 43271 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8661 36587 : }
8662 : /* We should have at least one already vectorized child. */
8663 14115 : gcc_assert (new_phis.exists ());
8664 :
8665 14115 : return true;
8666 14115 : }
8667 :
8668 : /* Vectorizes first order recurrences. An overview of the transformation
8669 : is described below. Suppose we have the following loop.
8670 :
8671 : int t = 0;
8672 : for (int i = 0; i < n; ++i)
8673 : {
8674 : b[i] = a[i] - t;
8675 : t = a[i];
8676 : }
8677 :
8678 : There is a first-order recurrence on 'a'. For this loop, the scalar IR
8679 : looks (simplified) like:
8680 :
8681 : scalar.preheader:
8682 : init = 0;
8683 :
8684 : scalar.body:
8685 : i = PHI <0(scalar.preheader), i+1(scalar.body)>
8686 : _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8687 : _1 = a[i]
8688 : b[i] = _1 - _2
8689 : if (i < n) goto scalar.body
8690 :
8691 : In this example, _2 is a recurrence because it's value depends on the
8692 : previous iteration. We vectorize this as (VF = 4)
8693 :
8694 : vector.preheader:
8695 : vect_init = vect_cst(..., ..., ..., 0)
8696 :
8697 : vector.body
8698 : i = PHI <0(vector.preheader), i+4(vector.body)>
8699 : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8700 : vect_2 = a[i, i+1, i+2, i+3];
8701 : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8702 : b[i, i+1, i+2, i+3] = vect_2 - vect_3
8703 : if (..) goto vector.body
8704 :
8705 : In this function, vectorizable_recurr, we code generate both the
8706 : vector PHI node and the permute since those together compute the
8707 : vectorized value of the scalar PHI. We do not yet have the
8708 : backedge value to fill in there nor into the vec_perm. Those
8709 : are filled in vect_schedule_scc.
8710 :
8711 : TODO: Since the scalar loop does not have a use of the recurrence
8712 : outside of the loop the natural way to implement peeling via
8713 : vectorizing the live value doesn't work. For now peeling of loops
8714 : with a recurrence is not implemented. For SLP the supported cases
8715 : are restricted to those requiring a single vector recurrence PHI. */
8716 :
bool
vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
		     slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  /* Only loop vectorization of PHI nodes is handled here.  */
  if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
    return false;

  gphi *phi = as_a<gphi *> (stmt_info->stmt);

  /* So far we only support first-order recurrence auto-vectorization.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
    return false;

  tree vectype = SLP_TREE_VECTYPE (slp_node);
  unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
  poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  /* DIST is the number of SLP lanes and thus the number of trailing
     scalar values carried over from the previous vector iteration.  */
  unsigned dist = SLP_TREE_LANES (slp_node);
  /* We need to be able to make progress with a single vector.  */
  if (maybe_gt (dist * 2, nunits))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "first order recurrence exceeds half of "
			 "a vector\n");
      return false;
    }

  /* We need to be able to build a { ..., a, b } init vector with
     dist number of distinct trailing values.  Always possible
     when dist == 1 or when nunits is constant or when the initializations
     are uniform.  */
  tree uniform_initval = NULL_TREE;
  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
  /* Detect whether all scalar PHIs share the same preheader value; if so
     the init vector can be a simple splat.  */
  for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
    {
      gphi *phi = as_a <gphi *> (s->stmt);
      if (! uniform_initval)
	uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
      else if (! operand_equal_p (uniform_initval,
				  PHI_ARG_DEF_FROM_EDGE (phi, pe)))
	{
	  uniform_initval = NULL_TREE;
	  break;
	}
    }
  /* With non-uniform initial values and a variable-length vector we
     cannot enumerate the element positions of the init vector.  */
  if (!uniform_initval && !nunits.is_constant ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cannot build initialization vector for "
			 "first order recurrence\n");
      return false;
    }

  /* First-order recurrence autovectorization needs to handle permutation
     with indices = [nunits-1, nunits, nunits+1, ...].  */
  vec_perm_builder sel (nunits, 1, 3);
  for (int i = 0; i < 3; ++i)
    sel.quick_push (nunits - dist + i);
  vec_perm_indices indices (sel, 2, nunits);

  if (cost_vec) /* transformation not required.  */
    {
      /* Analysis phase: verify target support and record costs.  */
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
				 indices))
	return false;

      /* We eventually need to set a vector type on invariant
	 arguments.  */
      unsigned j;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
	if (!vect_maybe_update_slp_op_vectype (child, vectype))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector types for "
			       "invariants\n");
	    return false;
	  }

      /* Verify we have set up compatible types.  */
      edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
      slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
      tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
      if (!types_compatible_p (latch_vectype, vectype))
	return false;

      /* The recurrence costs the initialization vector and one permute
	 for each copy.  With SLP the prologue value is explicitly
	 represented and costed separately.  */
      unsigned prologue_cost = 0;
      unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
					       slp_node, 0, vect_body);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizable_recurr: inside_cost = %d, "
			 "prologue_cost = %d .\n", inside_cost,
			 prologue_cost);

      SLP_TREE_TYPE (slp_node) = recurr_info_type;
      return true;
    }

  /* Transform phase: build the init vector, the vector PHI and the
     permutes computing the vectorized recurrence value.  */
  tree vec_init;
  if (! uniform_initval)
    {
      /* Build { 0, ..., 0, init_0, ..., init_dist-1 }: leading zeros
	 followed by the per-lane preheader values.  */
      vec<constructor_elt, va_gc> *v = NULL;
      vec_alloc (v, nunits.to_constant ());
      for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
	CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
				build_zero_cst (TREE_TYPE (vectype)));
      for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
	{
	  gphi *phi = as_a <gphi *> (s->stmt);
	  tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
	  if (!useless_type_conversion_p (TREE_TYPE (vectype),
					  TREE_TYPE (preheader)))
	    {
	      gimple_seq stmts = NULL;
	      preheader = gimple_convert (&stmts,
					  TREE_TYPE (vectype), preheader);
	      gsi_insert_seq_on_edge_immediate (pe, stmts);
	    }
	  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
	}
      vec_init = build_constructor (vectype, v);
    }
  else
    vec_init = uniform_initval;
  vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);

  /* Create the vectorized first-order PHI node.  */
  tree vec_dest = vect_get_new_vect_var (vectype,
					 vect_simple_var, "vec_recur_");
  basic_block bb = gimple_bb (phi);
  gphi *new_phi = create_phi_node (vec_dest, bb);
  add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);

  /* Insert the shuffles for the first-order recurrence autovectorization.
       result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
  tree perm = vect_gen_perm_mask_checked (vectype, indices);

  /* Insert the required permute after the latch definition.  The
     second and later operands are tentative and will be updated when we have
     vectorized the latch definition.  */
  edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
  gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
  gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
  gsi_next (&gsi2);

  for (unsigned i = 0; i < ncopies; ++i)
    {
      vec_dest = make_ssa_name (vectype);
      /* Only the first copy reads the vector PHI; later copies get their
	 first operand filled in by vect_schedule_scc.  */
      gassign *vperm
	= gimple_build_assign (vec_dest, VEC_PERM_EXPR,
			       i == 0 ? gimple_phi_result (new_phi) : NULL,
			       NULL, perm);
      vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);

      slp_node->push_vec_def (vperm);
    }

  return true;
}
8882 :
8883 : /* Return true if VECTYPE represents a vector that requires lowering
8884 : by the vector lowering pass. */
8885 :
8886 : bool
8887 647463 : vect_emulated_vector_p (tree vectype)
8888 : {
8889 1294926 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8890 650170 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8891 2689 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8892 : }
8893 :
8894 : /* Return true if we can emulate CODE on an integer mode representation
8895 : of a vector. */
8896 :
8897 : bool
8898 10523 : vect_can_vectorize_without_simd_p (tree_code code)
8899 : {
8900 10523 : switch (code)
8901 : {
8902 : case PLUS_EXPR:
8903 : case MINUS_EXPR:
8904 : case NEGATE_EXPR:
8905 : case BIT_AND_EXPR:
8906 : case BIT_IOR_EXPR:
8907 : case BIT_XOR_EXPR:
8908 : case BIT_NOT_EXPR:
8909 : return true;
8910 :
8911 9972 : default:
8912 9972 : return false;
8913 : }
8914 : }
8915 :
8916 : /* Likewise, but taking a code_helper. */
8917 :
8918 : bool
8919 154 : vect_can_vectorize_without_simd_p (code_helper code)
8920 : {
8921 154 : return (code.is_tree_code ()
8922 154 : && vect_can_vectorize_without_simd_p (tree_code (code)));
8923 : }
8924 :
/* Create vector init for vectorized iv.  Emit the statements computing
   the initial vector into STMTS.  INIT_EXPR and STEP_EXPR are the scalar
   initial value and step, NUNITS the number of vector elements, VECTYPE
   the vector type and INDUCTION_TYPE the kind of nonlinear evolution
   (shift, neg or mul).  Returns the initial vector value.  */
static tree
vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
			       tree step_expr, poly_uint64 nunits,
			       tree vectype,
			       enum vect_induction_op_type induction_type)
{
  unsigned HOST_WIDE_INT const_nunits;
  tree vec_shift, vec_init, new_name;
  unsigned i;
  tree itype = TREE_TYPE (vectype);

  /* iv_loop is the loop to be vectorized. Create:
     vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
  new_name = gimple_convert (stmts, itype, init_expr);
  switch (induction_type)
    {
    case vect_step_op_shr:
    case vect_step_op_shl:
      /* Build the Initial value from shift_expr: splat the scalar init
	 and shift lane i by i*S using a VEC_SERIES [0, S, 2*S, ...].  */
      vec_init = gimple_build_vector_from_val (stmts,
					       vectype,
					       new_name);
      vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
				build_zero_cst (itype), step_expr);
      vec_init = gimple_build (stmts,
			       (induction_type == vect_step_op_shr
				? RSHIFT_EXPR : LSHIFT_EXPR),
			       vectype, vec_init, vec_shift);
      break;

    case vect_step_op_neg:
      {
	/* Blend the splat of INIT with its negation so that lanes
	   alternate [X, -X, X, -X, ...].  */
	vec_init = gimple_build_vector_from_val (stmts,
						 vectype,
						 new_name);
	tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
				     vectype, vec_init);
	/* The encoding has 2 interleaved stepped patterns.  */
	vec_perm_builder sel (nunits, 2, 3);
	sel.quick_grow (6);
	for (i = 0; i < 3; i++)
	  {
	    sel[2 * i] = i;
	    sel[2 * i + 1] = i + nunits;
	  }
	vec_perm_indices indices (sel, 2, nunits);
	/* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
	   fail when vec_init is const vector. In that situation vec_perm is not
	   really needed.  */
	tree perm_mask_even
	  = vect_gen_perm_mask_any (vectype, indices);
	vec_init = gimple_build (stmts, VEC_PERM_EXPR,
				 vectype,
				 vec_init, vec_neg,
				 perm_mask_even);
      }
      break;

    case vect_step_op_mul:
      {
	/* Use unsigned mult to avoid UD integer overflow.  */
	gcc_assert (nunits.is_constant (&const_nunits));
	tree utype = unsigned_type_for (itype);
	tree uvectype = build_vector_type (utype,
					   TYPE_VECTOR_SUBPARTS (vectype));
	new_name = gimple_convert (stmts, utype, new_name);
	vec_init = gimple_build_vector_from_val (stmts,
						 uvectype,
						 new_name);
	/* Build the per-lane multipliers [1, S, S^2, ..., S^(nunits-1)].  */
	tree_vector_builder elts (uvectype, const_nunits, 1);
	tree elt_step = build_one_cst (utype);

	elts.quick_push (elt_step);
	for (i = 1; i < const_nunits; i++)
	  {
	    /* Create: new_name_i = new_name + step_expr.  */
	    elt_step = gimple_build (stmts, MULT_EXPR,
				     utype, elt_step, step_expr);
	    elts.quick_push (elt_step);
	  }
	/* Create a vector from [new_name_0, new_name_1, ...,
	   new_name_nunits-1].  */
	tree vec_mul = gimple_build_vector (stmts, &elts);
	vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
				 vec_init, vec_mul);
	vec_init = gimple_convert (stmts, vectype, vec_init);
      }
      break;

    default:
      gcc_unreachable ();
    }

  return vec_init;
}
9021 :
/* Peel init_expr by skip_niters for induction_type.  Returns the value
   the nonlinear IV has after SKIP_NITERS scalar iterations, emitting any
   needed statements into STMTS.  SKIP_NITERS must be an INTEGER_CST
   unless EARLY_EXIT_P (asserted below).  */
tree
vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
			     tree skip_niters, tree step_expr,
			     enum vect_induction_op_type induction_type,
			     bool early_exit_p)
{
  gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST || early_exit_p);
  tree type = TREE_TYPE (init_expr);
  unsigned prec = TYPE_PRECISION (type);
  switch (induction_type)
    {
    /* neg inductions are typically not used for loop termination conditions but
       are typically implemented as b = -b. That is every scalar iteration b is
       negated. That means that for the initial value of b we will have to
       determine whether the number of skipped iteration is a multiple of 2
       because every 2 scalar iterations we are back at "b".  */
    case vect_step_op_neg:
      /* For early exits the neg induction will always be the same value at the
	 start of the iteration.  */
      if (early_exit_p)
	break;

      /* Odd number of skipped iterations flips the sign once.  */
      if (TREE_INT_CST_LOW (skip_niters) % 2)
	init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
      /* else no change.  */
      break;

    case vect_step_op_shr:
    case vect_step_op_shl:
      /* The total shift amount is step * skip_niters.  */
      skip_niters = fold_build1 (NOP_EXPR, type, skip_niters);
      step_expr = fold_build1 (NOP_EXPR, type, step_expr);
      step_expr = fold_build2 (MULT_EXPR, type, step_expr, skip_niters);
      /* When shift amount >= precision, need to avoid UD.
	 In the original loop, there's no UD, and according to semantic,
	 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
      if ((!tree_fits_uhwi_p (step_expr)
	   || tree_to_uhwi (step_expr) >= prec)
	  && !early_exit_p)
	{
	  if (induction_type == vect_step_op_shl
	      || TYPE_UNSIGNED (type))
	    init_expr = build_zero_cst (type);
	  else
	    init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
				      init_expr,
				      wide_int_to_tree (type, prec - 1));
	}
      else
	{
	  init_expr = fold_build2 ((induction_type == vect_step_op_shr
				    ? RSHIFT_EXPR : LSHIFT_EXPR),
				   type, init_expr, step_expr);
	  init_expr = force_gimple_operand (init_expr, stmts, false, NULL);
	}
      break;

    case vect_step_op_mul:
      {
	/* Due to UB we can't support vect_step_op_mul with early break for now.
	   so assert and block.  */
	gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
	tree utype = unsigned_type_for (type);
	init_expr = gimple_convert (stmts, utype, init_expr);
	/* Compute step^skip_niters mod 2^prec exactly with GMP, then
	   multiply the (unsigned) init by it.  */
	wide_int skipn = wi::to_wide (skip_niters);
	wide_int begin = wi::to_wide (step_expr);
	auto_mpz base, exp, mod, res;
	wi::to_mpz (begin, base, TYPE_SIGN (type));
	wi::to_mpz (skipn, exp, UNSIGNED);
	mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
	mpz_powm (res, base, exp, mod);
	begin = wi::from_mpz (utype, res, true);
	tree mult_expr = wide_int_to_tree (utype, begin);
	init_expr = gimple_build (stmts, MULT_EXPR, utype,
				  init_expr, mult_expr);
	init_expr = gimple_convert (stmts, type, init_expr);
      }
      break;

    default:
      gcc_unreachable ();
    }

  return init_expr;
}
9107 :
9108 : /* Create vector step for vectorized iv. */
9109 : static tree
9110 1202 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9111 : poly_uint64 vf,
9112 : enum vect_induction_op_type induction_type)
9113 : {
9114 1202 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9115 1202 : tree new_name = NULL;
9116 : /* Step should be pow (step, vf) for mult induction. */
9117 1202 : if (induction_type == vect_step_op_mul)
9118 : {
9119 76 : gcc_assert (vf.is_constant ());
9120 76 : wide_int begin = wi::to_wide (step_expr);
9121 :
9122 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9123 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9124 :
9125 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9126 76 : }
9127 1126 : else if (induction_type == vect_step_op_neg)
9128 : /* Do nothing. */
9129 : ;
9130 : else
9131 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9132 : expr, step_expr);
9133 1202 : return new_name;
9134 : }
9135 :
9136 : static tree
9137 1202 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9138 : stmt_vec_info stmt_info,
9139 : tree new_name, tree vectype,
9140 : enum vect_induction_op_type induction_type)
9141 : {
9142 : /* No step is needed for neg induction. */
9143 1202 : if (induction_type == vect_step_op_neg)
9144 : return NULL;
9145 :
9146 94 : tree t = unshare_expr (new_name);
9147 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9148 : || TREE_CODE (new_name) == SSA_NAME);
9149 94 : tree new_vec = build_vector_from_val (vectype, t);
9150 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9151 : new_vec, vectype, NULL);
9152 94 : return vec_step;
9153 : }
9154 :
9155 : /* Update vectorized iv with vect_step, induc_def is init. */
9156 : static tree
9157 1390 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9158 : tree induc_def, tree vec_step,
9159 : enum vect_induction_op_type induction_type)
9160 : {
9161 1390 : tree vec_def = induc_def;
9162 1390 : switch (induction_type)
9163 : {
9164 76 : case vect_step_op_mul:
9165 76 : {
9166 : /* Use unsigned mult to avoid UD integer overflow. */
9167 76 : tree uvectype = unsigned_type_for (vectype);
9168 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9169 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9170 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9171 : vec_def, vec_step);
9172 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9173 : }
9174 76 : break;
9175 :
9176 12 : case vect_step_op_shr:
9177 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9178 : vec_def, vec_step);
9179 12 : break;
9180 :
9181 6 : case vect_step_op_shl:
9182 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9183 : vec_def, vec_step);
9184 6 : break;
9185 : case vect_step_op_neg:
9186 : vec_def = induc_def;
9187 : /* Do nothing. */
9188 : break;
9189 0 : default:
9190 0 : gcc_unreachable ();
9191 : }
9192 :
9193 1390 : return vec_def;
9194 :
9195 : }
9196 :
9197 : /* Function vectorizable_nonlinear_induction
9198 :
9199 : Check if STMT_INFO performs an nonlinear induction computation that can be
9200 : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9201 : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9202 : basic block.
9203 : Return true if STMT_INFO is vectorizable in this way. */
9204 :
static bool
vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
				  stmt_vec_info stmt_info,
				  slp_tree slp_node,
				  stmt_vector_for_cost *cost_vec)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned ncopies;
  bool nested_in_vect_loop = false;
  class loop *iv_loop;
  tree vec_def;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vec_init, vec_step;
  tree new_name;
  gimple *new_stmt;
  gphi *induction_phi;
  tree induc_def, vec_dest;
  tree init_expr, step_expr;
  tree niters_skip;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned i;
  gimple_stmt_iterator si;

  gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);

  tree vectype = SLP_TREE_VECTYPE (slp_node);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  enum vect_induction_op_type induction_type
    = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);

  /* Linear (vect_step_op_add) inductions are handled by the caller.  */
  gcc_assert (induction_type > vect_step_op_add);

  ncopies = vect_get_num_copies (loop_vinfo, slp_node);
  gcc_assert (ncopies >= 1);

  /* FORNOW. Only handle nonlinear induction in the same loop.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "nonlinear induction in nested loop.\n");
      return false;
    }

  iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);

  /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
     vector iv update for each iv and a permutation to generate wanted
     vector iv.  */
  if (SLP_TREE_LANES (slp_node) > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "SLP induction not supported for nonlinear"
			 " induction.\n");
      return false;
    }

  if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "floating point nonlinear induction vectorization"
			 " not supported.\n");
      return false;
    }

  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
  init_expr = vect_phi_initial_value (phi);
  gcc_assert (step_expr != NULL_TREE && init_expr != NULL
	      && TREE_CODE (step_expr) == INTEGER_CST);
  /* step_expr should be aligned with init_expr,
     i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
  step_expr = fold_convert (TREE_TYPE (vectype), step_expr);

  if (TREE_CODE (init_expr) == INTEGER_CST)
    init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
  else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
    {
      /* INIT_EXPR could be a bit_field, bail out for such case.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "nonlinear induction vectorization failed:"
			 " component type of vectype is not a nop conversion"
			 " from type of init_expr.\n");
      return false;
    }

  /* Check target support for the operations each evolution kind needs.  */
  switch (induction_type)
    {
    case vect_step_op_neg:
      if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
	return false;
      if (TREE_CODE (init_expr) != INTEGER_CST
	  && TREE_CODE (init_expr) != REAL_CST)
	{
	  /* Check for backend support of NEGATE_EXPR and vec_perm.  */
	  if (!directly_supported_p (NEGATE_EXPR, vectype))
	    return false;

	  /* The encoding has 2 interleaved stepped patterns.  */
	  vec_perm_builder sel (nunits, 2, 3);
	  machine_mode mode = TYPE_MODE (vectype);
	  sel.quick_grow (6);
	  for (i = 0; i < 3; i++)
	    {
	      sel[i * 2] = i;
	      sel[i * 2 + 1] = i + nunits;
	    }
	  vec_perm_indices indices (sel, 2, nunits);
	  if (!can_vec_perm_const_p (mode, mode, indices))
	    return false;
	}
      break;

    case vect_step_op_mul:
      {
	/* Check for backend support of MULT_EXPR.  */
	if (!directly_supported_p (MULT_EXPR, vectype))
	  return false;

	/* ?? How to construct vector step for variable number vector.
	   [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
	if (!vf.is_constant ())
	  return false;
      }
      break;

    case vect_step_op_shr:
      /* Check for backend support of RSHIFT_EXPR.  */
      if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
	return false;

      /* Don't shift more than type precision to avoid UD.  */
      if (!tree_fits_uhwi_p (step_expr)
	  || maybe_ge (nunits * tree_to_uhwi (step_expr),
		       TYPE_PRECISION (TREE_TYPE (init_expr))))
	return false;
      break;

    case vect_step_op_shl:
      /* Check for backend support of LSHIFT_EXPR.  */
      if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
	return false;

      /* Don't shift more than type precision to avoid UD.  */
      if (!tree_fits_uhwi_p (step_expr)
	  || maybe_ge (nunits * tree_to_uhwi (step_expr),
		       TYPE_PRECISION (TREE_TYPE (init_expr))))
	return false;

      break;

    default:
      gcc_unreachable ();
    }

  if (cost_vec) /* transformation not required.  */
    {
      unsigned inside_cost = 0, prologue_cost = 0;
      /* loop cost for vec_loop. Neg induction doesn't have any
	 inside_cost.  */
      inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
				      slp_node, 0, vect_body);

      /* loop cost for vec_loop. Neg induction doesn't have any
	 inside_cost.  */
      if (induction_type == vect_step_op_neg)
	inside_cost = 0;

      /* prologue cost for vec_init and vec_step.  */
      prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
					slp_node, 0, vect_prologue);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vect_model_induction_cost: inside_cost = %d, "
			 "prologue_cost = %d. \n", inside_cost,
			 prologue_cost);

      SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
      DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
      return true;
    }

  /* Transform.  */

  /* Compute a vector variable, initialized with the first VF values of
     the induction variable.  E.g., for an iv with IV_PHI='X' and
     evolution S, for a vector of 4 units, we want to compute:
     [X, X + S, X + 2*S, X + 3*S].  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");

  pe = loop_preheader_edge (iv_loop);
  /* Find the first insertion point in the BB.  */
  basic_block bb = gimple_bb (phi);
  si = gsi_after_labels (bb);

  gimple_seq stmts = NULL;

  niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  /* If we are using the loop mask to "peel" for alignment then we need
     to adjust the start value here.  */
  if (niters_skip != NULL_TREE)
    init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
					     step_expr, induction_type, false);

  vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
					    step_expr, nunits, vectype,
					    induction_type);
  /* Any init statements go on the preheader edge, which must not need
     splitting.  */
  if (stmts)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  stmts = NULL;
  new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
					    vf, induction_type);
  if (stmts)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
						new_name, vectype,
						induction_type);
  /* Create the following def-use cycle:
     loop prolog:
	  vec_init = ...
	  vec_step = ...
     loop:
	  vec_iv = PHI <vec_init, vec_loop>
	  ...
	  STMT
	  ...
	  vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  stmts = NULL;
  vec_def = vect_update_nonlinear_iv (&stmts, vectype,
				      induc_def, vec_step,
				      induction_type);

  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
  new_stmt = SSA_NAME_DEF_STMT (vec_def);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
	       UNKNOWN_LOCATION);

  slp_node->push_vec_def (induction_phi);

  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmts = NULL;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Per-copy step advances by nunits lanes rather than the full VF.  */
      new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
						nunits, induction_type);

      vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
						    new_name, vectype,
						    induction_type);
      vec_def = induc_def;
      for (i = 1; i < ncopies; i++)
	{
	  /* vec_i = vec_prev + vec_step.  */
	  stmts = NULL;
	  vec_def = vect_update_nonlinear_iv (&stmts, vectype,
					      vec_def, vec_step,
					      induction_type);
	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
	  slp_node->push_vec_def (new_stmt);
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "transform induction: created def-use cycle: %G%G",
		     (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));

  return true;
}
9508 :
9509 : /* Function vectorizable_induction
9510 :
9511 : Check if STMT_INFO performs an induction computation that can be vectorized.
9512 : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9513 : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9514 : Return true if STMT_INFO is vectorizable in this way. */
9515 :
9516 : bool
9517 295381 : vectorizable_induction (loop_vec_info loop_vinfo,
9518 : stmt_vec_info stmt_info,
9519 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9520 : {
9521 295381 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9522 295381 : bool nested_in_vect_loop = false;
9523 295381 : class loop *iv_loop;
9524 295381 : tree vec_def;
9525 295381 : edge pe = loop_preheader_edge (loop);
9526 295381 : basic_block new_bb;
9527 295381 : tree vec_init = NULL_TREE, vec_step, t;
9528 295381 : tree new_name;
9529 295381 : gphi *induction_phi;
9530 295381 : tree induc_def, vec_dest;
9531 295381 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9532 295381 : unsigned i;
9533 295381 : tree expr;
9534 295381 : tree index_vectype = NULL_TREE;
9535 295381 : gimple_stmt_iterator si;
9536 295381 : enum vect_induction_op_type induction_type
9537 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9538 :
9539 323308 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9540 156228 : if (!phi)
9541 : return false;
9542 :
9543 156228 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9544 : return false;
9545 :
9546 : /* Make sure it was recognized as induction computation. */
9547 156228 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9548 : return false;
9549 :
9550 : /* Handle nonlinear induction in a separate place. */
9551 152597 : if (induction_type != vect_step_op_add)
9552 10194 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9553 10194 : slp_node, cost_vec);
9554 :
9555 142403 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9556 142403 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9557 :
9558 : /* FORNOW. These restrictions should be relaxed. */
9559 142403 : if (nested_in_vect_loop_p (loop, stmt_info))
9560 : {
9561 740 : imm_use_iterator imm_iter;
9562 740 : use_operand_p use_p;
9563 740 : gimple *exit_phi;
9564 740 : edge latch_e;
9565 740 : tree loop_arg;
9566 :
9567 740 : exit_phi = NULL;
9568 740 : latch_e = loop_latch_edge (loop->inner);
9569 740 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9570 2256 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9571 : {
9572 800 : gimple *use_stmt = USE_STMT (use_p);
9573 800 : if (is_gimple_debug (use_stmt))
9574 36 : continue;
9575 :
9576 764 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9577 : {
9578 : exit_phi = use_stmt;
9579 : break;
9580 : }
9581 740 : }
9582 740 : if (exit_phi)
9583 : {
9584 24 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9585 24 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9586 8 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9587 : {
9588 16 : if (dump_enabled_p ())
9589 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9590 : "inner-loop induction only used outside "
9591 : "of the outer vectorized loop.\n");
9592 16 : return false;
9593 : }
9594 : }
9595 :
9596 724 : nested_in_vect_loop = true;
9597 724 : iv_loop = loop->inner;
9598 : }
9599 : else
9600 : iv_loop = loop;
9601 142387 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9602 :
9603 142387 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
9604 : {
9605 : /* The current SLP code creates the step value element-by-element. */
9606 : if (dump_enabled_p ())
9607 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9608 : "SLP induction not supported for variable-length"
9609 : " vectors.\n");
9610 : return false;
9611 : }
9612 :
9613 142387 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9614 : {
9615 12 : if (dump_enabled_p ())
9616 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9617 : "floating point induction vectorization disabled\n");
9618 12 : return false;
9619 : }
9620 :
9621 142375 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9622 142375 : gcc_assert (step_expr != NULL_TREE);
9623 284704 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
9624 284608 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
9625 : {
9626 12 : if (dump_enabled_p ())
9627 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9628 : "bit-precision induction vectorization not "
9629 : "supported.\n");
9630 12 : return false;
9631 : }
9632 142363 : tree stept = TREE_TYPE (step_expr);
9633 142363 : tree step_vectype = get_same_sized_vectype (stept, vectype);
9634 142363 : stept = TREE_TYPE (step_vectype);
9635 :
9636 : /* Check for target support of the vectorized arithmetic used here. */
9637 142363 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
9638 142363 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
9639 24256 : return false;
9640 118107 : if (!nunits.is_constant ())
9641 : {
9642 : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
9643 : return false;
9644 : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
9645 : if (SCALAR_FLOAT_TYPE_P (stept))
9646 : {
9647 : tree index_type = build_nonstandard_integer_type
9648 : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
9649 :
9650 : index_vectype = build_vector_type (index_type, nunits);
9651 : if (!can_float_p (TYPE_MODE (step_vectype),
9652 : TYPE_MODE (index_vectype), 1))
9653 : return false;
9654 : }
9655 : }
9656 :
9657 118107 : unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
9658 118107 : if (cost_vec) /* transformation not required. */
9659 : {
9660 308550 : unsigned inside_cost = 0, prologue_cost = 0;
9661 : /* We eventually need to set a vector type on invariant
9662 : arguments. */
9663 : unsigned j;
9664 : slp_tree child;
9665 308550 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9666 205700 : if (!vect_maybe_update_slp_op_vectype
9667 205700 : (child, SLP_TREE_VECTYPE (slp_node)))
9668 : {
9669 0 : if (dump_enabled_p ())
9670 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9671 : "incompatible vector types for "
9672 : "invariants\n");
9673 0 : return false;
9674 : }
9675 : /* loop cost for vec_loop. */
9676 102850 : inside_cost = record_stmt_cost (cost_vec, nvects,
9677 : vector_stmt, slp_node, 0, vect_body);
9678 : /* prologue cost for vec_init (if not nested) and step. */
9679 102850 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9680 : scalar_to_vec,
9681 : slp_node, 0, vect_prologue);
9682 102850 : if (dump_enabled_p ())
9683 4025 : dump_printf_loc (MSG_NOTE, vect_location,
9684 : "vect_model_induction_cost: inside_cost = %d, "
9685 : "prologue_cost = %d .\n", inside_cost,
9686 : prologue_cost);
9687 :
9688 102850 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9689 102850 : DUMP_VECT_SCOPE ("vectorizable_induction");
9690 102850 : return true;
9691 : }
9692 :
9693 : /* Transform. */
9694 :
9695 : /* Compute a vector variable, initialized with the first VF values of
9696 : the induction variable. E.g., for an iv with IV_PHI='X' and
9697 : evolution S, for a vector of 4 units, we want to compute:
9698 : [X, X + S, X + 2*S, X + 3*S]. */
9699 :
9700 15257 : if (dump_enabled_p ())
9701 2770 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9702 :
9703 15257 : pe = loop_preheader_edge (iv_loop);
9704 : /* Find the first insertion point in the BB. */
9705 15257 : basic_block bb = gimple_bb (phi);
9706 15257 : si = gsi_after_labels (bb);
9707 :
9708 : /* For SLP induction we have to generate several IVs as for example
9709 : with group size 3 we need
9710 : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9711 : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9712 15257 : gimple_stmt_iterator incr_si;
9713 15257 : bool insert_after;
9714 15257 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
9715 :
9716 : /* The initial values are vectorized, but any lanes > group_size
9717 : need adjustment. */
9718 15257 : slp_tree init_node
9719 15257 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9720 :
9721 : /* Gather steps. Since we do not vectorize inductions as
9722 : cycles we have to reconstruct the step from SCEV data. */
9723 15257 : unsigned group_size = SLP_TREE_LANES (slp_node);
9724 15257 : tree *steps = XALLOCAVEC (tree, group_size);
9725 15257 : tree *inits = XALLOCAVEC (tree, group_size);
9726 15257 : stmt_vec_info phi_info;
9727 47005 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9728 : {
9729 16491 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9730 16491 : if (!init_node)
9731 16252 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9732 : pe->dest_idx);
9733 : }
9734 :
9735 : /* Now generate the IVs. */
9736 30514 : gcc_assert (multiple_p (nunits * nvects, group_size));
9737 15257 : unsigned nivs;
9738 15257 : unsigned HOST_WIDE_INT const_nunits;
9739 15257 : if (nested_in_vect_loop)
9740 : nivs = nvects;
9741 15039 : else if (nunits.is_constant (&const_nunits))
9742 : {
9743 : /* Compute the number of distinct IVs we need. First reduce
9744 : group_size if it is a multiple of const_nunits so we get
9745 : one IV for a group_size of 4 but const_nunits 2. */
9746 15039 : unsigned group_sizep = group_size;
9747 15039 : if (group_sizep % const_nunits == 0)
9748 111 : group_sizep = group_sizep / const_nunits;
9749 15039 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
9750 : }
9751 : else
9752 : {
9753 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9754 : nivs = 1;
9755 : }
9756 15257 : gimple_seq init_stmts = NULL;
9757 15257 : tree lupdate_mul = NULL_TREE;
9758 218 : if (!nested_in_vect_loop)
9759 : {
9760 15039 : if (nunits.is_constant (&const_nunits))
9761 : {
9762 : /* The number of iterations covered in one vector iteration. */
9763 15039 : unsigned lup_mul = (nvects * const_nunits) / group_size;
9764 15039 : lupdate_mul
9765 15039 : = build_vector_from_val (step_vectype,
9766 15039 : SCALAR_FLOAT_TYPE_P (stept)
9767 28 : ? build_real_from_wide (stept, lup_mul,
9768 : UNSIGNED)
9769 30050 : : build_int_cstu (stept, lup_mul));
9770 : }
9771 : else
9772 : {
9773 : if (SCALAR_FLOAT_TYPE_P (stept))
9774 : {
9775 : tree tem = build_int_cst (integer_type_node, vf);
9776 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9777 : }
9778 : else
9779 : lupdate_mul = build_int_cst (stept, vf);
9780 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9781 : lupdate_mul);
9782 : }
9783 : }
9784 15257 : tree peel_mul = NULL_TREE;
9785 15257 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9786 : {
9787 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9788 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9789 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9790 : else
9791 0 : peel_mul = gimple_convert (&init_stmts, stept,
9792 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9793 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
9794 : step_vectype, peel_mul);
9795 : }
9796 15257 : tree step_mul = NULL_TREE;
9797 15257 : unsigned ivn;
9798 15257 : auto_vec<tree> vec_steps;
9799 31090 : for (ivn = 0; ivn < nivs; ++ivn)
9800 : {
9801 15833 : gimple_seq stmts = NULL;
9802 15833 : bool invariant = true;
9803 15833 : if (nunits.is_constant (&const_nunits))
9804 : {
9805 15833 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9806 15833 : tree_vector_builder init_elts (vectype, const_nunits, 1);
9807 15833 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9808 103439 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9809 : {
9810 : /* The scalar steps of the IVs. */
9811 87606 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9812 87606 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9813 87606 : step_elts.quick_push (elt);
9814 87606 : if (!init_node)
9815 : {
9816 : /* The scalar inits of the IVs if not vectorized. */
9817 86356 : elt = inits[(ivn*const_nunits + eltn) % group_size];
9818 86356 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
9819 86356 : TREE_TYPE (elt)))
9820 264 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9821 264 : TREE_TYPE (vectype), elt);
9822 86356 : init_elts.quick_push (elt);
9823 : }
9824 : /* The number of steps to add to the initial values. */
9825 87606 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9826 175212 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9827 175110 : ? build_real_from_wide (stept, mul_elt,
9828 : UNSIGNED)
9829 175110 : : build_int_cstu (stept, mul_elt));
9830 : }
9831 15833 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
9832 15833 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9833 15833 : if (!init_node)
9834 15581 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
9835 15833 : }
9836 : else
9837 : {
9838 : tree step = gimple_convert (&init_stmts, stept, steps[0]);
9839 : if (init_node)
9840 : ;
9841 : else if (INTEGRAL_TYPE_P (stept))
9842 : {
9843 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9844 : /* Build the initial value directly as a VEC_SERIES_EXPR. */
9845 : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
9846 : step_vectype, new_name, step);
9847 : if (!useless_type_conversion_p (vectype, step_vectype))
9848 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9849 : vectype, vec_init);
9850 : }
9851 : else
9852 : {
9853 : /* Build:
9854 : [base, base, base, ...]
9855 : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9856 : gcc_assert (SCALAR_FLOAT_TYPE_P (stept));
9857 : gcc_assert (flag_associative_math);
9858 : gcc_assert (index_vectype != NULL_TREE);
9859 :
9860 : tree index = build_index_vector (index_vectype, 0, 1);
9861 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9862 : tree base_vec = gimple_build_vector_from_val (&init_stmts,
9863 : step_vectype,
9864 : new_name);
9865 : tree step_vec = gimple_build_vector_from_val (&init_stmts,
9866 : step_vectype,
9867 : step);
9868 : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
9869 : step_vectype, index);
9870 : vec_init = gimple_build (&init_stmts, MULT_EXPR,
9871 : step_vectype, vec_init, step_vec);
9872 : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
9873 : step_vectype, vec_init, base_vec);
9874 : if (!useless_type_conversion_p (vectype, step_vectype))
9875 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9876 : vectype, vec_init);
9877 : }
9878 : /* iv_loop is nested in the loop to be vectorized. Generate:
9879 : vec_step = [S, S, S, S] */
9880 : t = unshare_expr (step);
9881 : gcc_assert (CONSTANT_CLASS_P (t)
9882 : || TREE_CODE (t) == SSA_NAME);
9883 : vec_step = gimple_build_vector_from_val (&init_stmts,
9884 : step_vectype, t);
9885 : }
9886 15833 : vec_steps.safe_push (vec_step);
9887 15833 : if (peel_mul)
9888 : {
9889 0 : if (!step_mul)
9890 : {
9891 0 : gcc_assert (!nunits.is_constant ());
9892 : step_mul = gimple_build (&init_stmts,
9893 : MINUS_EXPR, step_vectype,
9894 : build_zero_cst (step_vectype), peel_mul);
9895 : }
9896 : else
9897 0 : step_mul = gimple_build (&init_stmts,
9898 : MINUS_EXPR, step_vectype,
9899 : step_mul, peel_mul);
9900 : }
9901 :
9902 : /* Create the induction-phi that defines the induction-operand. */
9903 15833 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9904 : "vec_iv_");
9905 15833 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9906 15833 : induc_def = PHI_RESULT (induction_phi);
9907 :
9908 : /* Create the iv update inside the loop */
9909 15833 : tree up = vec_step;
9910 15833 : if (lupdate_mul)
9911 : {
9912 15581 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
9913 : {
9914 : /* When we're using loop_len produced by SELEC_VL, the
9915 : non-final iterations are not always processing VF
9916 : elements. So vectorize induction variable instead of
9917 :
9918 : _21 = vect_vec_iv_.6_22 + { VF, ... };
9919 :
9920 : We should generate:
9921 :
9922 : _35 = .SELECT_VL (ivtmp_33, VF);
9923 : vect_cst__22 = [vec_duplicate_expr] _35;
9924 : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
9925 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
9926 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
9927 : vectype, 0, 0, false);
9928 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9929 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
9930 : else
9931 0 : expr = gimple_convert (&stmts, stept, len);
9932 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
9933 : expr);
9934 0 : up = gimple_build (&stmts, MULT_EXPR,
9935 : step_vectype, vec_step, lupdate_mul);
9936 : }
9937 : else
9938 15581 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9939 : vec_step, lupdate_mul);
9940 : }
9941 15833 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9942 15833 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
9943 15833 : vec_def = gimple_convert (&stmts, vectype, vec_def);
9944 15833 : insert_iv_increment (&incr_si, insert_after, stmts);
9945 15833 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9946 : UNKNOWN_LOCATION);
9947 :
9948 15833 : if (init_node)
9949 252 : vec_init = vect_get_slp_vect_def (init_node, ivn);
9950 15833 : if (!nested_in_vect_loop
9951 15833 : && step_mul
9952 15833 : && !integer_zerop (step_mul))
9953 : {
9954 15140 : gcc_assert (invariant);
9955 15140 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9956 15140 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9957 : vec_step, step_mul);
9958 15140 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9959 : vec_def, up);
9960 15140 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9961 : }
9962 :
9963 : /* Set the arguments of the phi node: */
9964 15833 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9965 :
9966 15833 : slp_node->push_vec_def (induction_phi);
9967 : }
9968 15257 : if (!nested_in_vect_loop)
9969 : {
9970 : /* Fill up to the number of vectors we need for the whole group. */
9971 15039 : if (nunits.is_constant (&const_nunits))
9972 15039 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
9973 : else
9974 : nivs = 1;
9975 15039 : vec_steps.reserve (nivs-ivn);
9976 30099 : for (; ivn < nivs; ++ivn)
9977 : {
9978 21 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9979 21 : vec_steps.quick_push (vec_steps[0]);
9980 : }
9981 : }
9982 :
9983 : /* Re-use IVs when we can. We are generating further vector
9984 : stmts by adding VF' * stride to the IVs generated above. */
9985 15257 : if (ivn < nvects)
9986 : {
9987 3391 : if (nunits.is_constant (&const_nunits))
9988 : {
9989 3391 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
9990 3391 : / group_size);
9991 3391 : lupdate_mul
9992 3391 : = build_vector_from_val (step_vectype,
9993 3391 : SCALAR_FLOAT_TYPE_P (stept)
9994 8 : ? build_real_from_wide (stept,
9995 8 : vfp, UNSIGNED)
9996 6774 : : build_int_cstu (stept, vfp));
9997 : }
9998 : else
9999 : {
10000 : if (SCALAR_FLOAT_TYPE_P (stept))
10001 : {
10002 : tree tem = build_int_cst (integer_type_node, nunits);
10003 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
10004 : }
10005 : else
10006 : lupdate_mul = build_int_cst (stept, nunits);
10007 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
10008 : lupdate_mul);
10009 : }
10010 10902 : for (; ivn < nvects; ++ivn)
10011 : {
10012 7511 : gimple *iv
10013 7511 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10014 7511 : tree def = gimple_get_lhs (iv);
10015 7511 : if (ivn < 2*nivs)
10016 3483 : vec_steps[ivn - nivs]
10017 3483 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10018 3483 : vec_steps[ivn - nivs], lupdate_mul);
10019 7511 : gimple_seq stmts = NULL;
10020 7511 : def = gimple_convert (&stmts, step_vectype, def);
10021 22533 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10022 7511 : def, vec_steps[ivn % nivs]);
10023 7511 : def = gimple_convert (&stmts, vectype, def);
10024 7511 : if (gimple_code (iv) == GIMPLE_PHI)
10025 3483 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10026 : else
10027 : {
10028 4028 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10029 4028 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10030 : }
10031 7511 : slp_node->push_vec_def (def);
10032 : }
10033 : }
10034 :
10035 15257 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10036 15257 : gcc_assert (!new_bb);
10037 :
10038 15257 : return true;
10039 15257 : }
10040 :
/* Function vectorizable_live_operation_1.

   Helper function for vectorizable_live_operation.  Materialize, on the
   loop exit block EXIT_BB, the scalar value of a live-out vector lane.

   VECTYPE is the type of the vectorized result VEC_LHS; BITSIZE and
   BITSTART give the bit position of the wanted lane inside the vector
   (both are tree constants, BITSTART in bits from lane 0).  LHS_TYPE is
   the scalar type the extracted value must be converted to.  SLP_NODE is
   the SLP node the live stmt belongs to (used only for assertions in the
   masked/length-controlled paths).

   On return *EXIT_GSI points just after the labels of EXIT_BB, i.e. after
   the extraction statements this function inserted, so the caller can
   insert the replacement assignment there.  Returns the SSA name (or
   gimple value) holding the extracted scalar.  */

static tree
vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
			       tree vectype, slp_tree slp_node,
			       tree bitsize, tree bitstart, tree vec_lhs,
			       tree lhs_type, gimple_stmt_iterator *exit_gsi)
{
  /* EXIT_BB must be a proper exit destination: either a single-pred block
     or one of the merge blocks created for early-break vectorization.  */
  gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));

  /* Create a loop-closed PHI in EXIT_BB carrying VEC_LHS so the extraction
     below satisfies the loop-closed SSA requirement; every incoming edge
     feeds the same vectorized value.  */
  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
  for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
    SET_PHI_ARG_DEF (phi, i, vec_lhs);

  gimple_seq stmts = NULL;
  tree new_tree;

  /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
  if (integer_zerop (bitstart))
    {
      tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
				      vec_lhs_phi, bitsize, bitstart);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
    }
  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    {
      /* Length-controlled loop: the last active lane is LEN - 1, known
	 only at runtime.  Emit:

	 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>

	 where VEC_LHS is the vectorized live-out result, LEN is the length of
	 the vector, BIAS is the load-store bias.  The bias should not be used
	 at all since we are not using load/store operations, but LEN will be
	 REALLEN + BIAS, so subtract it to get to the correct position.  */
      gcc_assert (SLP_TREE_LANES (slp_node) == 1);
      gimple_seq tem = NULL;
      gimple_stmt_iterator gsi = gsi_last (tem);
      tree len = vect_get_loop_len (loop_vinfo, &gsi,
				    &LOOP_VINFO_LENS (loop_vinfo),
				    1, vectype, 0, 1, false);
      /* vect_get_loop_len may emit setup statements into TEM; splice them
	 ahead of the extraction.  */
      gimple_seq_add_seq (&stmts, tem);

      /* LAST_INDEX = LEN - 1.  */
      tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
				      len, build_one_cst (TREE_TYPE (len)));

      /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>.  */
      tree scalar_res
	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
			vec_lhs_phi, last_index);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
    }
  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Fully-masked loop: the last active lane is determined by the loop
	 mask of the final iteration.  Emit:

	 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>

	 where VEC_LHS is the vectorized live-out result and MASK is
	 the loop mask for the final iteration.  */
      gcc_assert (SLP_TREE_LANES (slp_node) == 1);
      tree scalar_type = TREE_TYPE (vectype);
      gimple_seq tem = NULL;
      gimple_stmt_iterator gsi = gsi_last (tem);
      tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
				      &LOOP_VINFO_MASKS (loop_vinfo),
				      1, vectype, 0);
      tree scalar_res;
      /* vect_get_loop_mask may emit setup statements into TEM; splice them
	 ahead of the extraction.  */
      gimple_seq_add_seq (&stmts, tem);

      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
				 mask, vec_lhs_phi);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
    }
  else
    {
      /* Plain lane extraction at a non-zero, compile-time-known bit
	 position.  For boolean vectors extract through a same-width
	 integer type since BIT_FIELD_REF on the boolean element type is
	 not directly usable here.  */
      tree bftype = TREE_TYPE (vectype);
      if (VECTOR_BOOLEAN_TYPE_P (vectype))
	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
				       &stmts, true, NULL_TREE);
    }

  /* Insert the generated statements right after the labels of EXIT_BB and
     hand the insertion point back to the caller.  */
  *exit_gsi = gsi_after_labels (exit_bb);
  if (stmts)
    gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);

  return new_tree;
}
10140 :
10141 : /* Function vectorizable_live_operation.
10142 :
10143 : STMT_INFO computes a value that is used outside the loop. Check if
10144 : it can be supported. */
10145 :
10146 : bool
10147 264786 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10148 : slp_tree slp_node, slp_instance slp_node_instance,
10149 : int slp_index, bool vec_stmt_p,
10150 : stmt_vector_for_cost *cost_vec)
10151 : {
10152 264786 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10153 264786 : imm_use_iterator imm_iter;
10154 264786 : tree lhs, lhs_type, bitsize;
10155 264786 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10156 264786 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10157 264786 : gimple *use_stmt;
10158 264786 : use_operand_p use_p;
10159 264786 : auto_vec<tree> vec_oprnds;
10160 264786 : int vec_entry = 0;
10161 264786 : poly_uint64 vec_index = 0;
10162 :
10163 264786 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10164 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10165 :
10166 : /* If a stmt of a reduction is live, vectorize it via
10167 : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10168 : validity so just trigger the transform here. */
10169 264786 : if (vect_is_reduction (slp_node))
10170 : {
10171 57808 : if (!vec_stmt_p)
10172 : return true;
10173 : /* For SLP reductions we vectorize the epilogue for all involved stmts
10174 : together. For SLP reduction chains we only get here once. */
10175 23481 : if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
10176 23222 : && slp_index != 0)
10177 : return true;
10178 23022 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
10179 23022 : if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10180 23022 : || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10181 : return true;
10182 :
10183 22179 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10184 22179 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10185 22170 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10186 : slp_node_instance,
10187 : LOOP_VINFO_MAIN_EXIT (loop_vinfo));
10188 :
10189 : /* If early break we only have to materialize the reduction on the merge
10190 : block, but we have to find an alternate exit first. */
10191 22179 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10192 : {
10193 28 : slp_tree phis_node = slp_node_instance->reduc_phis;
10194 28 : stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
10195 89 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10196 28 : if (exit != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
10197 : {
10198 23 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10199 : phis_node, slp_node_instance,
10200 : exit);
10201 23 : break;
10202 28 : }
10203 28 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10204 9 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10205 : phis_node, slp_node_instance,
10206 : LOOP_VINFO_MAIN_EXIT
10207 : (loop_vinfo));
10208 : }
10209 :
10210 22179 : return true;
10211 : }
10212 :
10213 : /* If STMT is not relevant and it is a simple assignment and its inputs are
10214 : invariant then it can remain in place, unvectorized. The original last
10215 : scalar value that it computes will be used. */
10216 206978 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10217 : {
10218 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10219 0 : if (dump_enabled_p ())
10220 0 : dump_printf_loc (MSG_NOTE, vect_location,
10221 : "statement is simple and uses invariant. Leaving in "
10222 : "place.\n");
10223 0 : return true;
10224 : }
10225 :
10226 206978 : gcc_assert (slp_index >= 0);
10227 :
10228 : /* Get the last occurrence of the scalar index from the concatenation of
10229 : all the slp vectors. Calculate which slp vector it is and the index
10230 : within. */
10231 206978 : int num_scalar = SLP_TREE_LANES (slp_node);
10232 206978 : int num_vec = vect_get_num_copies (vinfo, slp_node);
10233 206978 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10234 :
10235 : /* Calculate which vector contains the result, and which lane of
10236 : that vector we need. */
10237 206978 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10238 : {
10239 : if (dump_enabled_p ())
10240 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10241 : "Cannot determine which vector holds the"
10242 : " final result.\n");
10243 : return false;
10244 : }
10245 :
10246 206978 : if (!vec_stmt_p)
10247 : {
10248 : /* No transformation required. */
10249 164672 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10250 : {
10251 27648 : if (SLP_TREE_LANES (slp_node) != 1)
10252 : {
10253 19 : if (dump_enabled_p ())
10254 19 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10255 : "can't operate on partial vectors "
10256 : "because an SLP statement is live after "
10257 : "the loop.\n");
10258 19 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10259 : }
10260 27629 : else if (num_vec > 1)
10261 : {
10262 15998 : if (dump_enabled_p ())
10263 51 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10264 : "can't operate on partial vectors "
10265 : "because ncopies is greater than 1.\n");
10266 15998 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10267 : }
10268 : else
10269 : {
10270 11631 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10271 : OPTIMIZE_FOR_SPEED))
10272 0 : vect_record_loop_mask (loop_vinfo,
10273 : &LOOP_VINFO_MASKS (loop_vinfo),
10274 : 1, vectype, NULL);
10275 11631 : else if (can_vec_extract_var_idx_p (
10276 11631 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10277 0 : vect_record_loop_len (loop_vinfo,
10278 : &LOOP_VINFO_LENS (loop_vinfo),
10279 : 1, vectype, 1);
10280 : else
10281 : {
10282 11631 : if (dump_enabled_p ())
10283 645 : dump_printf_loc (
10284 645 : MSG_MISSED_OPTIMIZATION, vect_location,
10285 : "can't operate on partial vectors "
10286 : "because the target doesn't support extract "
10287 : "last reduction.\n");
10288 11631 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10289 : }
10290 : }
10291 : }
10292 : /* ??? Enable for loop costing as well. */
10293 27648 : if (!loop_vinfo)
10294 93938 : record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
10295 : 0, vect_epilogue);
10296 164672 : return true;
10297 : }
10298 :
10299 : /* Use the lhs of the original scalar statement. */
10300 42306 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10301 42306 : if (dump_enabled_p ())
10302 1026 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10303 : "stmt %G", stmt);
10304 :
10305 42306 : lhs = gimple_get_lhs (stmt);
10306 42306 : lhs_type = TREE_TYPE (lhs);
10307 :
10308 42306 : bitsize = vector_element_bits_tree (vectype);
10309 :
10310 : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10311 42306 : gcc_assert (!loop_vinfo
10312 : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10313 : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10314 : || SLP_TREE_LANES (slp_node) == 1));
10315 :
10316 : /* Get the correct slp vectorized stmt. */
10317 42306 : tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10318 42306 : gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10319 :
10320 : /* In case we need to early break vectorize also get the first stmt. */
10321 42306 : tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10322 :
10323 : /* Get entry to use. */
10324 42306 : tree bitstart = bitsize_int (vec_index);
10325 42306 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10326 :
10327 42306 : if (loop_vinfo)
10328 : {
10329 : /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10330 : requirement, insert one phi node for it. It looks like:
10331 : loop;
10332 : BB:
10333 : # lhs' = PHI <lhs>
10334 : ==>
10335 : loop;
10336 : BB:
10337 : # vec_lhs' = PHI <vec_lhs>
10338 : new_tree = lane_extract <vec_lhs', ...>;
10339 : lhs' = new_tree; */
10340 :
10341 2882 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10342 : /* Check if we have a loop where the chosen exit is not the main exit,
10343 : in these cases for an early break we restart the iteration the vector code
10344 : did. For the live values we want the value at the start of the iteration
10345 : rather than at the end. */
10346 2882 : edge main_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
10347 2882 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10348 14981 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10349 9217 : if (!is_gimple_debug (use_stmt)
10350 9217 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10351 2823 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10352 : {
10353 2823 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10354 2823 : phi_arg_index_from_use (use_p));
10355 2823 : gcc_assert (loop_exit_edge_p (loop, e));
10356 2823 : bool main_exit_edge = e == main_e;
10357 2823 : tree tmp_vec_lhs = vec_lhs;
10358 2823 : tree tmp_bitstart = bitstart;
10359 :
10360 : /* For early exit where the exit is not in the BB that leads
10361 : to the latch then we're restarting the iteration in the
10362 : scalar loop. So get the first live value. */
10363 2823 : bool early_break_first_element_p
10364 2823 : = all_exits_as_early_p || !main_exit_edge;
10365 2823 : if (early_break_first_element_p)
10366 : {
10367 195 : tmp_vec_lhs = vec_lhs0;
10368 195 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10369 : }
10370 :
10371 2823 : gimple_stmt_iterator exit_gsi;
10372 2823 : tree new_tree
10373 2823 : = vectorizable_live_operation_1 (loop_vinfo,
10374 : e->dest, vectype,
10375 : slp_node, bitsize,
10376 : tmp_bitstart, tmp_vec_lhs,
10377 : lhs_type, &exit_gsi);
10378 :
10379 2823 : auto gsi = gsi_for_stmt (use_stmt);
10380 2823 : tree lhs_phi = gimple_phi_result (use_stmt);
10381 2823 : remove_phi_node (&gsi, false);
10382 2823 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10383 2823 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10384 2823 : break;
10385 2882 : }
10386 :
10387 : /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10388 12158 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10389 6394 : gcc_assert (is_gimple_debug (use_stmt)
10390 2882 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10391 : }
10392 : else
10393 : {
10394 : /* For basic-block vectorization simply insert the lane-extraction. */
10395 39424 : tree bftype = TREE_TYPE (vectype);
10396 39424 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10397 0 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10398 39424 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10399 : vec_lhs, bitsize, bitstart);
10400 39424 : gimple_seq stmts = NULL;
10401 39424 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10402 : &stmts, true, NULL_TREE);
10403 39424 : if (TREE_CODE (new_tree) == SSA_NAME
10404 78848 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10405 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10406 39424 : if (is_a <gphi *> (vec_stmt))
10407 : {
10408 2579 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10409 2579 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10410 : }
10411 : else
10412 : {
10413 36845 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10414 36845 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10415 : }
10416 :
10417 : /* Replace use of lhs with newly computed result. If the use stmt is a
10418 : single arg PHI, just replace all uses of PHI result. It's necessary
10419 : because lcssa PHI defining lhs may be before newly inserted stmt. */
10420 39424 : use_operand_p use_p;
10421 39424 : stmt_vec_info use_stmt_info;
10422 247813 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10423 168965 : if (!is_gimple_debug (use_stmt)
10424 168965 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10425 118383 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10426 : {
10427 : /* ??? This can happen when the live lane ends up being
10428 : rooted in a vector construction code-generated by an
10429 : external SLP node (and code-generation for that already
10430 : happened). See gcc.dg/vect/bb-slp-47.c.
10431 : Doing this is what would happen if that vector CTOR
10432 : were not code-generated yet so it is not too bad.
10433 : ??? In fact we'd likely want to avoid this situation
10434 : in the first place. */
10435 67130 : if (TREE_CODE (new_tree) == SSA_NAME
10436 66768 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10437 66768 : && gimple_code (use_stmt) != GIMPLE_PHI
10438 126623 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10439 : use_stmt))
10440 : {
10441 362 : if (dump_enabled_p ())
10442 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10443 : "Using original scalar computation for "
10444 : "live lane because use preceeds vector "
10445 : "def\n");
10446 362 : continue;
10447 : }
10448 204132 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10449 : {
10450 : /* ??? It can also happen that we end up pulling a def into
10451 : a loop where replacing out-of-loop uses would require
10452 : a new LC SSA PHI node. Retain the original scalar in
10453 : those cases as well. PR98064. */
10454 68863 : edge e;
10455 68863 : if (TREE_CODE (new_tree) == SSA_NAME
10456 68863 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10457 68863 : && (gimple_bb (use_stmt)->loop_father
10458 68863 : != gimple_bb (vec_stmt)->loop_father)
10459 : /* But a replacement in a LC PHI is OK. This happens
10460 : in gcc.dg/vect/bb-slp-57.c for example. */
10461 7416 : && (gimple_code (use_stmt) != GIMPLE_PHI
10462 2796 : || (((e = phi_arg_edge_from_use (use_p)), true)
10463 2796 : && !loop_exit_edge_p
10464 2796 : (gimple_bb (vec_stmt)->loop_father, e)))
10465 74481 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10466 5618 : gimple_bb (use_stmt)->loop_father))
10467 : {
10468 0 : if (dump_enabled_p ())
10469 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10470 : "Using original scalar computation for "
10471 : "live lane because there is an "
10472 : "out-of-loop definition for it\n");
10473 0 : continue;
10474 : }
10475 68863 : SET_USE (use_p, new_tree);
10476 : }
10477 66406 : update_stmt (use_stmt);
10478 39424 : }
10479 : }
10480 :
10481 : return true;
10482 264786 : }
10483 :
10484 : /* Given loop represented by LOOP_VINFO, return true if computation of
10485 : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10486 : otherwise. */
10487 :
10488 : static bool
10489 61666 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
10490 : {
10491 61666 : gcc_assert (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo));
10492 :
10493 : /* Constant case. */
10494 61666 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10495 : {
10496 35815 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10497 35815 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10498 :
10499 35815 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10500 35815 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10501 35815 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10502 : return true;
10503 : }
10504 :
10505 25851 : widest_int max;
10506 25851 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10507 : /* Check the upper bound of loop niters. */
10508 25851 : if (get_max_loop_iterations (loop, &max))
10509 : {
10510 25851 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10511 25851 : signop sgn = TYPE_SIGN (type);
10512 25851 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10513 25851 : if (max < type_max)
10514 25626 : return true;
10515 25851 : }
10516 : return false;
10517 25851 : }
10518 :
10519 : /* Return a mask type with half the number of elements as OLD_TYPE,
10520 : given that it should have mode NEW_MODE. */
10521 :
10522 : tree
10523 3714 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10524 : {
10525 3714 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10526 3714 : return build_truth_vector_type_for_mode (nunits, new_mode);
10527 : }
10528 :
10529 : /* Return a mask type with twice as many elements as OLD_TYPE,
10530 : given that it should have mode NEW_MODE. */
10531 :
10532 : tree
10533 5915 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10534 : {
10535 5915 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10536 5915 : return build_truth_vector_type_for_mode (nunits, new_mode);
10537 : }
10538 :
10539 : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10540 : contain a sequence of NVECTORS masks that each control a vector of type
10541 : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10542 : these vector masks with the vector version of SCALAR_MASK. */
10543 :
10544 : void
10545 77672 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10546 : unsigned int nvectors, tree vectype, tree scalar_mask)
10547 : {
10548 77672 : gcc_assert (nvectors != 0);
10549 :
10550 77672 : if (scalar_mask)
10551 : {
10552 3638 : scalar_cond_masked_key cond (scalar_mask, nvectors);
10553 3638 : loop_vinfo->scalar_cond_masked_set.add (cond);
10554 : }
10555 :
10556 77672 : masks->mask_set.add (std::make_pair (vectype, nvectors));
10557 77672 : }
10558 :
10559 : /* Given a complete set of masks MASKS, extract mask number INDEX
10560 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10561 : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10562 :
10563 : See the comment above vec_loop_masks for more details about the mask
10564 : arrangement. */
10565 :
tree
vect_get_loop_mask (loop_vec_info loop_vinfo,
		    gimple_stmt_iterator *gsi, vec_loop_masks *masks,
		    unsigned int nvectors, tree vectype, unsigned int index)
{
  /* WHILE_ULT style: masks are stored per rgroup, indexed by the number
     of vectors the rgroup operates on.  */
  if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
      == vect_partial_vectors_while_ult)
    {
      rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
      tree mask_type = rgm->type;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it.  */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (nvectors, true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
	      /* Provide a dummy definition until the real one is available.  */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}

      tree mask = rgm->controls[index];
      if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
		    TYPE_VECTOR_SUBPARTS (vectype)))
	{
	  /* A loop mask for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  In this case each sequence
	     of N elements in the loop mask will be all-zero or all-one.
	     We can then view-convert the mask so that each sequence of
	     N elements is replaced by a single element.  */
	  gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
				  TYPE_VECTOR_SUBPARTS (vectype)));
	  gimple_seq seq = NULL;
	  mask_type = truth_type_for (vectype);
	  mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
      return mask;
    }
  /* AVX512 style: masks have integer modes and are stored per rgroup,
     indexed by the number of scalars per iteration.  */
  else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	   == vect_partial_vectors_avx512)
    {
      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];

      /* The stored nV is dependent on the mask type produced.  */
      gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
			     TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
		  == rgm->factor);
      nvectors = rgm->factor;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it.  */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (nvectors, true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
	      /* Provide a dummy definition until the real one is available.  */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}
      /* Fast path: the recorded mask already has the right element count.  */
      if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
		    TYPE_VECTOR_SUBPARTS (vectype)))
	return rgm->controls[index];

      /* Split the vector if needed.  Since we are dealing with integer mode
	 masks with AVX512 we can operate on the integer representation
	 performing the whole vector shifting.  */
      unsigned HOST_WIDE_INT factor;
      bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
				     TYPE_VECTOR_SUBPARTS (vectype), &factor);
      gcc_assert (ok);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
      tree mask_type = truth_type_for (vectype);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
      /* INDEX selects sub-mask VPART of stored control VI.  */
      unsigned vi = index / factor;
      unsigned vpart = index % factor;
      tree vec = rgm->controls[vi];
      gimple_seq seq = NULL;
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
			  lang_hooks.types.type_for_mode
			  (TYPE_MODE (rgm->type), 1), vec);
      /* For integer mode masks simply shift the right bits into position.  */
      if (vpart != 0)
	vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
			    build_int_cst (integer_type_node,
					   (TYPE_VECTOR_SUBPARTS (vectype)
					    * vpart)));
      /* Narrow to the mask-sized integer, then reinterpret as a mask.  */
      vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
			    (TYPE_MODE (mask_type), 1), vec);
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
      if (seq)
	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
      return vec;
    }
  else
    gcc_unreachable ();
}
10678 :
10679 : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10680 : lengths for controlling an operation on VECTYPE. The operation splits
10681 : each element of VECTYPE into FACTOR separate subelements, measuring the
10682 : length as a number of these subelements. */
10683 :
10684 : void
10685 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10686 : unsigned int nvectors, tree vectype, unsigned int factor)
10687 : {
10688 0 : gcc_assert (nvectors != 0);
10689 0 : if (lens->length () < nvectors)
10690 0 : lens->safe_grow_cleared (nvectors, true);
10691 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10692 :
10693 : /* The number of scalars per iteration, scalar occupied bytes and
10694 : the number of vectors are both compile-time constants. */
10695 0 : unsigned int nscalars_per_iter
10696 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10697 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10698 :
10699 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10700 : {
10701 : /* For now, we only support cases in which all loads and stores fall back
10702 : to VnQI or none do. */
10703 0 : gcc_assert (!rgl->max_nscalars_per_iter
10704 : || (rgl->factor == 1 && factor == 1)
10705 : || (rgl->max_nscalars_per_iter * rgl->factor
10706 : == nscalars_per_iter * factor));
10707 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
10708 0 : rgl->type = vectype;
10709 0 : rgl->factor = factor;
10710 : }
10711 0 : }
10712 :
10713 : /* Given a complete set of lengths LENS, extract length number INDEX
10714 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10715 : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
   multiplied by the number of elements that should be processed.
10717 : Insert any set-up statements before GSI. */
10718 :
tree
vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
		   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
		   unsigned int index, unsigned int factor, bool adjusted)
{
  rgroup_controls *rgl = &(*lens)[nvectors - 1];
  /* A nonzero partial-load/store bias means a second, bias-adjusted
     length SSA name is maintained alongside the raw one.  */
  bool use_bias_adjusted_len =
    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;

  /* Populate the rgroup's len array, if this is the first time we've
     used it.  */
  if (rgl->controls.is_empty ())
    {
      rgl->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
	  gcc_assert (len_type != NULL_TREE);

	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");

	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
	  rgl->controls[i] = len;

	  if (use_bias_adjusted_len)
	    {
	      /* Only a single control is supported in the biased case.  */
	      gcc_assert (i == 0);
	      tree adjusted_len =
		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
	      rgl->bias_adjusted_ctrl = adjusted_len;
	    }
	}
    }

  if (use_bias_adjusted_len && adjusted)
    return rgl->bias_adjusted_ctrl;

  tree loop_len = rgl->controls[index];
  if (rgl->factor == 1 && factor == 1)
    {
      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
      if (maybe_ne (nunits1, nunits2))
	{
	  /* A loop len for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  */
	  gcc_assert (multiple_p (nunits1, nunits2));
	  factor = exact_div (nunits1, nunits2).to_constant ();
	  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	  gimple_seq seq = NULL;
	  loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
				   build_int_cst (iv_type, factor));
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
    }
  else if (factor && rgl->factor != factor)
    {
      /* The stored factor differs from the requested one; rescale the
	 stored length accordingly.  The number of scalars per iteration,
	 scalar occupied bytes and the number of vectors are all
	 compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
      unsigned int rglvecsize = rgl->factor * rgl->max_nscalars_per_iter;
      unsigned int vecsize = nscalars_per_iter * factor;
      if (rglvecsize > vecsize)
	{
	  /* Stored length counts finer subelements: divide down.  */
	  unsigned int fac = rglvecsize / vecsize;
	  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	  gimple_seq seq = NULL;
	  loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
				   build_int_cst (iv_type, fac));
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
      else if (rglvecsize < vecsize)
	{
	  /* Stored length counts coarser subelements: multiply up.  */
	  unsigned int fac = vecsize / rglvecsize;
	  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	  gimple_seq seq = NULL;
	  loop_len = gimple_build (&seq, MULT_EXPR, iv_type, loop_len,
				   build_int_cst (iv_type, fac));
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
    }
  return loop_len;
}
10810 :
10811 : /* Generate the tree for the loop len mask and return it. Given the lens,
10812 : nvectors, vectype, index and factor to gen the len mask as below.
10813 :
10814 : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
10815 : */
10816 : tree
10817 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10818 : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
10819 : unsigned int nvectors, tree vectype, tree stmt,
10820 : unsigned int index, unsigned int factor)
10821 : {
10822 0 : tree all_one_mask = build_all_ones_cst (vectype);
10823 0 : tree all_zero_mask = build_zero_cst (vectype);
10824 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
10825 : factor, true);
10826 0 : tree bias = build_int_cst (intQI_type_node,
10827 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
10828 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
10829 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
10830 : all_one_mask, all_zero_mask, len,
10831 : bias);
10832 0 : gimple_call_set_lhs (call, len_mask);
10833 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
10834 :
10835 0 : return len_mask;
10836 : }
10837 :
10838 : /* Scale profiling counters by estimation for LOOP which is vectorized
10839 : by factor VF.
10840 : If FLAT is true, the loop we started with had unrealistically flat
10841 : profile. */
10842 :
10843 : static void
10844 61709 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
10845 : {
10846 : /* For flat profiles do not scale down proportionally by VF and only
10847 : cap by known iteration count bounds. */
10848 61709 : if (flat)
10849 : {
10850 34690 : if (dump_file && (dump_flags & TDF_DETAILS))
10851 5257 : fprintf (dump_file,
10852 : "Vectorized loop profile seems flat; not scaling iteration "
10853 : "count down by the vectorization factor %i\n", vf);
10854 34690 : scale_loop_profile (loop, profile_probability::always (),
10855 : get_likely_max_loop_iterations_int (loop));
10856 34690 : return;
10857 : }
10858 : /* Loop body executes VF fewer times and exit increases VF times. */
10859 27019 : profile_count entry_count = loop_preheader_edge (loop)->count ();
10860 :
10861 : /* If we have unreliable loop profile avoid dropping entry
10862 : count below header count. This can happen since loops
10863 : has unrealistically low trip counts. */
10864 27019 : while (vf > 1
10865 28150 : && loop->header->count > entry_count
10866 57329 : && loop->header->count < entry_count * vf)
10867 : {
10868 2160 : if (dump_file && (dump_flags & TDF_DETAILS))
10869 153 : fprintf (dump_file,
10870 : "Vectorization factor %i seems too large for profile "
10871 : "prevoiusly believed to be consistent; reducing.\n", vf);
10872 2160 : vf /= 2;
10873 : }
10874 :
10875 27019 : if (entry_count.nonzero_p ())
10876 27019 : set_edge_probability_and_rescale_others
10877 27019 : (exit_e,
10878 27019 : entry_count.probability_in (loop->header->count / vf));
10879 : /* Avoid producing very large exit probability when we do not have
10880 : sensible profile. */
10881 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
10882 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10883 27019 : loop->latch->count = single_pred_edge (loop->latch)->count ();
10884 :
10885 27019 : scale_loop_profile (loop, profile_probability::always () / vf,
10886 : get_likely_max_loop_iterations_int (loop));
10887 : }
10888 :
10889 : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10890 : original loop that has now been vectorized.
10891 :
10892 : The inits of the data_references need to be advanced with the number of
10893 : iterations of the main loop. This has been computed in vect_do_peeling and
10894 : is stored in parameter ADVANCE.
10895 :
10896 : Since the loop_vec_info of this EPILOGUE was constructed for the original
10897 : loop, its stmt_vec_infos all point to the original statements. These need
10898 : to be updated to point to their corresponding copies.
10899 :
10900 : The data_reference's connections also need to be updated. Their
10901 : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10902 : stmt_vec_infos, their statements need to point to their corresponding
10903 : copy. */
10904 :
10905 : static void
10906 7058 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10907 : {
10908 7058 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10909 7058 : hash_map<tree,tree> mapping;
10910 7058 : gimple *orig_stmt, *new_stmt;
10911 7058 : gimple_stmt_iterator epilogue_gsi;
10912 7058 : gphi_iterator epilogue_phi_gsi;
10913 7058 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10914 7058 : basic_block *epilogue_bbs = get_loop_body (epilogue);
10915 7058 : unsigned i;
10916 :
10917 7058 : free (LOOP_VINFO_BBS (epilogue_vinfo));
10918 7058 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10919 7058 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
10920 :
10921 : /* The EPILOGUE loop is a copy of the original loop so they share the same
10922 : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10923 : point to the copied statements. */
10924 21174 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10925 : {
10926 14116 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10927 36325 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10928 : {
10929 22209 : new_stmt = epilogue_phi_gsi.phi ();
10930 :
10931 22209 : gcc_assert (gimple_uid (new_stmt) > 0);
10932 22209 : stmt_vinfo
10933 22209 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10934 :
10935 22209 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10936 : }
10937 :
10938 28232 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10939 144672 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10940 : {
10941 130556 : new_stmt = gsi_stmt (epilogue_gsi);
10942 130556 : if (is_gimple_debug (new_stmt))
10943 21966 : continue;
10944 :
10945 108590 : gcc_assert (gimple_uid (new_stmt) > 0);
10946 108590 : stmt_vinfo
10947 108590 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10948 :
10949 108590 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10950 :
10951 108590 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10952 108590 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10953 : {
10954 2009 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10955 : /* Set BB such that the assert in
10956 : 'get_initial_defs_for_reduction' is able to determine that
10957 : the BB of the related stmt is inside this loop. */
10958 2009 : gimple_set_bb (stmt,
10959 : gimple_bb (new_stmt));
10960 2009 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10961 2009 : gcc_assert (related_vinfo == NULL
10962 : || related_vinfo == stmt_vinfo);
10963 : }
10964 : }
10965 : }
10966 :
10967 7058 : struct data_reference *dr;
10968 7058 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10969 32645 : FOR_EACH_VEC_ELT (datarefs, i, dr)
10970 : {
10971 25587 : orig_stmt = DR_STMT (dr);
10972 25587 : gcc_assert (gimple_uid (orig_stmt) > 0);
10973 25587 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10974 25587 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10975 : }
10976 :
10977 : /* Advance data_reference's with the number of iterations of the previous
10978 : loop and its prologue. */
10979 7058 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10980 :
10981 : /* Remember the advancement made. */
10982 7058 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
10983 7058 : }
10984 :
10985 : /* When vectorizing early break statements instructions that happen before
10986 : the early break in the current BB need to be moved to after the early
10987 : break. This function deals with that and assumes that any validity
   checks have already been performed.
10989 :
10990 : While moving the instructions if it encounters a VUSE or VDEF it then
10991 : corrects the VUSES as it moves the statements along. GDEST is the location
10992 : in which to insert the new statements. */
10993 :
static void
move_early_exit_stmts (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("move_early_exit_stmts");

  /* Nothing to do if no stores were recorded as needing to move.  */
  if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
    return;

  /* Move all stmts that need moving.  */
  basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
  gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);

  /* Track the most recent VDEF/VUSE among the moved statements so the
     loads and exit PHIs below can be rewired to it.  */
  tree last_seen_vuse = NULL_TREE;
  for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
    {
      /* We have to update crossed degenerate virtual PHIs.  Simply
	 elide them.  */
      if (gphi *vphi = dyn_cast <gphi *> (stmt))
	{
	  tree vdef = gimple_phi_result (vphi);
	  tree vuse = gimple_phi_arg_def (vphi, 0);
	  imm_use_iterator iter;
	  use_operand_p use_p;
	  gimple *use_stmt;
	  /* Forward all uses of the PHI result to its single argument.  */
	  FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
		SET_USE (use_p, vuse);
	    }
	  auto gsi = gsi_for_stmt (stmt);
	  remove_phi_node (&gsi, true);
	  last_seen_vuse = vuse;
	  continue;
	}

      /* Check to see if statement is still required for vect or has been
	 elided.  */
      auto stmt_info = loop_vinfo->lookup_stmt (stmt);
      if (!stmt_info)
	continue;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);

      /* Move the statement to the destination block, after previously
	 moved statements.  */
      gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
      gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
      last_seen_vuse = gimple_vuse (stmt);
    }

  /* Update all the stmts with their new reaching VUSES.  */
  for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "updating vuse to %T for load %G",
			 last_seen_vuse, p);
      gimple_set_vuse (p, last_seen_vuse);
      update_stmt (p);
    }

  /* And update the LC PHIs on exits.  Exits dominated by DEST_BB already
     see the moved statements' virtual defs and need no fixup.  */
  for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
      if (gphi *phi = get_virtual_phi (e->dest))
	SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
}
11060 :
11061 : /* Generate adjustment code for early break scalar IVs filling in the value
11062 : we created earlier on for LOOP_VINFO_EARLY_BRK_NITERS_VAR. */
11063 :
11064 : static void
11065 1405 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vec_info loop_vinfo)
11066 : {
11067 1405 : DUMP_VECT_SCOPE ("vect_update_ivs_after_vectorizer_for_early_breaks");
11068 :
11069 1405 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11070 0 : return;
11071 :
11072 1405 : gcc_assert (LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo));
11073 :
11074 1405 : tree phi_var = LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo);
11075 1405 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11076 1405 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11077 1405 : tree ty_var = TREE_TYPE (phi_var);
11078 1405 : auto loop = LOOP_VINFO_LOOP (loop_vinfo);
11079 1405 : tree induc_var = niters_skip ? copy_ssa_name (phi_var) : phi_var;
11080 :
11081 1405 : auto induction_phi = create_phi_node (induc_var, loop->header);
11082 1405 : tree induc_def = PHI_RESULT (induction_phi);
11083 :
11084 : /* Create the iv update inside the loop. */
11085 1405 : gimple_seq init_stmts = NULL;
11086 1405 : gimple_seq stmts = NULL;
11087 1405 : gimple_seq iv_stmts = NULL;
11088 1405 : tree tree_vf = build_int_cst (ty_var, vf);
11089 :
11090 : /* For loop len targets we have to use .SELECT_VL (ivtmp_33, VF); instead of
11091 : just += VF as the VF can change in between two loop iterations. */
11092 1405 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
11093 : {
11094 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
11095 0 : tree_vf = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
11096 : NULL_TREE, 0, 0, true);
11097 : }
11098 :
11099 1405 : tree iter_var;
11100 1405 : if (POINTER_TYPE_P (ty_var))
11101 : {
11102 0 : tree offset = gimple_convert (&stmts, sizetype, tree_vf);
11103 0 : iter_var = gimple_build (&stmts, POINTER_PLUS_EXPR, ty_var, induc_def,
11104 : gimple_convert (&stmts, sizetype, offset));
11105 : }
11106 : else
11107 : {
11108 1405 : tree offset = gimple_convert (&stmts, ty_var, tree_vf);
11109 1405 : iter_var = gimple_build (&stmts, PLUS_EXPR, ty_var, induc_def, offset);
11110 : }
11111 :
11112 1405 : tree init_var = build_zero_cst (ty_var);
11113 1405 : if (niters_skip)
11114 0 : init_var = gimple_build (&init_stmts, MINUS_EXPR, ty_var, init_var,
11115 : gimple_convert (&init_stmts, ty_var, niters_skip));
11116 :
11117 1405 : add_phi_arg (induction_phi, iter_var,
11118 : loop_latch_edge (loop), UNKNOWN_LOCATION);
11119 1405 : add_phi_arg (induction_phi, init_var,
11120 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
11121 :
11122 : /* Find the first insertion point in the BB. */
11123 1405 : auto pe = loop_preheader_edge (loop);
11124 :
11125 : /* If we've done any peeling, calculate the peeling adjustment needed to the
11126 : final IV. */
11127 1405 : if (niters_skip)
11128 : {
11129 0 : tree induc_type = TREE_TYPE (induc_def);
11130 0 : tree s_induc_type = signed_type_for (induc_type);
11131 0 : induc_def = gimple_build (&iv_stmts, MAX_EXPR, s_induc_type,
11132 : gimple_convert (&iv_stmts, s_induc_type,
11133 : induc_def),
11134 : build_zero_cst (s_induc_type));
11135 0 : auto stmt = gimple_build_assign (phi_var,
11136 : gimple_convert (&iv_stmts, induc_type,
11137 : induc_def));
11138 0 : gimple_seq_add_stmt_without_update (&iv_stmts, stmt);
11139 0 : basic_block exit_bb = NULL;
11140 : /* Identify the early exit merge block. I wish we had stored this. */
11141 0 : for (auto e : get_loop_exit_edges (loop))
11142 0 : if (e != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
11143 : {
11144 0 : exit_bb = e->dest;
11145 0 : break;
11146 0 : }
11147 :
11148 0 : gcc_assert (exit_bb);
11149 0 : auto exit_gsi = gsi_after_labels (exit_bb);
11150 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
11151 : }
11152 : /* Write the init_stmts in the loop-preheader block. */
11153 1405 : auto psi = gsi_last_nondebug_bb (pe->src);
11154 1405 : gsi_insert_seq_after (&psi, init_stmts, GSI_LAST_NEW_STMT);
11155             : 	  /* Write the adjustments in the header block.  */
11156 1405 : basic_block bb = loop->header;
11157 1405 : auto si = gsi_after_labels (bb);
11158 1405 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11159 : }
11160 :
11161 : /* Function vect_transform_loop.
11162 :
11163 : The analysis phase has determined that the loop is vectorizable.
11164 : Vectorize the loop - created vectorized stmts to replace the scalar
11165 : stmts in the loop, and update the loop exit condition.
11166 : Returns scalar epilogue loop if any. */
11167 :
11168 : class loop *
11169 61709 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11170 : {
11171 61709 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11172 61709 : class loop *epilogue = NULL;
11173 61709 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11174 61709 : int nbbs = loop->num_nodes;
11175 61709 : int i;
11176 61709 : tree niters_vector = NULL_TREE;
11177 61709 : tree step_vector = NULL_TREE;
11178 61709 : tree niters_vector_mult_vf = NULL_TREE;
11179 61709 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11180 61709 : unsigned int lowest_vf = constant_lower_bound (vf);
11181 61709 : gimple *stmt;
11182 61709 : bool check_profitability = false;
11183 61709 : unsigned int th;
11184 61709 : bool flat = maybe_flat_loop_profile (loop);
11185 61709 : bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);
11186 :
11187 61709 : DUMP_VECT_SCOPE ("vec_transform_loop");
11188 :
11189 61709 : if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11190 54651 : loop_vinfo->shared->check_datarefs ();
11191 :
11192 : /* Use the more conservative vectorization threshold. If the number
11193 : of iterations is constant assume the cost check has been performed
11194 : by our caller. If the threshold makes all loops profitable that
11195 : run at least the (estimated) vectorization factor number of times
11196 : checking is pointless, too. */
11197 61709 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11198 61709 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11199 : {
11200 18840 : if (dump_enabled_p ())
11201 174 : dump_printf_loc (MSG_NOTE, vect_location,
11202 : "Profitability threshold is %d loop iterations.\n",
11203 : th);
11204 : check_profitability = true;
11205 : }
11206 :
11207 : /* Make sure there exists a single-predecessor exit bb. Do this before
11208 : versioning. */
11209 61709 : edge e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
11210 61709 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11211 : {
11212 19277 : split_loop_exit_edge (e, true);
11213 19277 : if (dump_enabled_p ())
11214 2268 : dump_printf (MSG_NOTE, "split exit edge\n");
11215 : }
11216 :
11217 : /* Version the loop first, if required, so the profitability check
11218 : comes first. */
11219 :
11220 61709 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11221 : {
11222 3783 : class loop *sloop
11223 3783 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11224 3783 : sloop->force_vectorize = false;
11225 3783 : check_profitability = false;
11226 : }
11227 :
11228 : /* Make sure there exists a single-predecessor exit bb also on the
11229 : scalar loop copy. Do this after versioning but before peeling
11230 : so CFG structure is fine for both scalar and if-converted loop
11231 : to make slpeel_duplicate_current_defs_from_edges face matched
11232 : loop closed PHI nodes on the exit. */
11233 61709 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11234 : {
11235 8039 : e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
11236 8039 : if (! single_pred_p (e->dest))
11237 : {
11238 7770 : split_loop_exit_edge (e, true);
11239 7770 : if (dump_enabled_p ())
11240 1140 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11241 : }
11242 : }
11243 :
11244 61709 : tree niters = vect_build_loop_niters (loop_vinfo);
11245 61709 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11246 61709 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11247 61709 : tree advance;
11248 61709 : drs_init_vec orig_drs_init;
11249 61709 : bool niters_no_overflow = uncounted_p ? false /* Not known. */
11250 61666 : : loop_niters_no_overflow (loop_vinfo);
11251 :
11252 61709 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11253 : &step_vector, &niters_vector_mult_vf, th,
11254 : check_profitability, niters_no_overflow,
11255 : &advance);
11256 :
11257 : /* Assign hierarchical discriminators to the vectorized loop. */
11258 61709 : poly_uint64 vf_val = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11259 61709 : unsigned int vf_int = constant_lower_bound (vf_val);
11260 61709 : if (vf_int > DISCR_MULTIPLICITY_MAX)
11261 : vf_int = DISCR_MULTIPLICITY_MAX;
11262 :
11263 : /* Assign unique copy_id dynamically instead of using hardcoded constants.
11264 : Epilogue and main vectorized loops get different copy_ids. */
11265 61709 : gimple *loop_last = last_nondebug_stmt (loop->header);
11266 61709 : location_t loop_loc
11267 61709 : = loop_last ? gimple_location (loop_last) : UNKNOWN_LOCATION;
11268 61435 : if (loop_loc != UNKNOWN_LOCATION)
11269 : {
11270 50906 : unsigned int copyid = allocate_copyid_base (loop_loc, 1);
11271 50906 : assign_discriminators_to_loop (loop, vf_int, copyid);
11272 : }
11273 61709 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11274 61709 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11275 : {
11276 : /* Ifcvt duplicates loop preheader, loop body and produces an basic
11277 : block after loop exit. We need to scale all that. */
11278 91 : basic_block preheader
11279 91 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11280 91 : preheader->count
11281 : = preheader->count.apply_probability
11282 91 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11283 91 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11284 : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11285 91 : LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo)->dest->count = preheader->count;
11286 : }
11287 :
11288 61709 : if (niters_vector == NULL_TREE && !uncounted_p)
11289 : {
11290 27331 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11291 27331 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11292 55404 : && known_eq (lowest_vf, vf))
11293 : {
11294 27328 : niters_vector
11295 27328 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11296 27328 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11297 27328 : step_vector = build_one_cst (TREE_TYPE (niters));
11298 : }
11299 748 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11300 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11301 : &step_vector, niters_no_overflow);
11302 : else
11303 : /* vect_do_peeling subtracted the number of peeled prologue
11304 : iterations from LOOP_VINFO_NITERS. */
11305 747 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11306 : &niters_vector, &step_vector,
11307 : niters_no_overflow);
11308 : }
11309 :
11310 : /* 1) Make sure the loop header has exactly two entries
11311 : 2) Make sure we have a preheader basic block. */
11312 :
11313 61709 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11314 :
11315 61709 : split_edge (loop_preheader_edge (loop));
11316 :
11317 61709 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11318 : /* This will deal with any possible peeling. */
11319 1 : vect_prepare_for_masked_peels (loop_vinfo);
11320 :
11321 : /* Handle any code motion that we need to for early-break vectorization after
11322 : we've done peeling but just before we start vectorizing. */
11323 61709 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11324 : {
11325 1405 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vinfo);
11326 1405 : move_early_exit_stmts (loop_vinfo);
11327 : }
11328 :
11329 : /* Remove existing clobber stmts and prefetches. */
11330 188368 : for (i = 0; i < nbbs; i++)
11331 : {
11332 126659 : basic_block bb = bbs[i];
11333 1099874 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11334 : {
11335 846556 : stmt = gsi_stmt (si);
11336 846556 : if (gimple_clobber_p (stmt)
11337 846556 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11338 : {
11339 90 : unlink_stmt_vdef (stmt);
11340 90 : gsi_remove (&si, true);
11341 90 : release_defs (stmt);
11342 : }
11343 : else
11344 846466 : gsi_next (&si);
11345 : }
11346 : }
11347 :
11348 : /* Schedule the SLP instances. */
11349 61709 : if (!loop_vinfo->slp_instances.is_empty ())
11350 : {
11351 61709 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11352 61709 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11353 : }
11354 :
11355 : /* Generate the loop invariant statements. */
11356 61709 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11357 : {
11358 73 : if (dump_enabled_p ())
11359 30 : dump_printf_loc (MSG_NOTE, vect_location,
11360 : "------>generating loop invariant statements\n");
11361 73 : gimple_stmt_iterator gsi;
11362 73 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11363 73 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11364 : GSI_CONTINUE_LINKING);
11365 : }
11366 :
11367 : /* Stub out scalar statements that must not survive vectorization and
11368 : were not picked as relevant in any SLP instance.
11369 : Doing this here helps with grouped statements, or statements that
11370 : are involved in patterns. */
11371 188368 : for (i = 0; i < nbbs; i++)
11372 : {
11373 126659 : basic_block bb = bbs[i];
11374 126659 : stmt_vec_info stmt_info;
11375 253318 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11376 1686820 : !gsi_end_p (gsi); gsi_next (&gsi))
11377 : {
11378 1560161 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11379 6355 : if (!call || !gimple_call_internal_p (call))
11380 1554960 : continue;
11381 5201 : internal_fn ifn = gimple_call_internal_fn (call);
11382 5201 : if (ifn == IFN_MASK_LOAD)
11383 : {
11384 735 : tree lhs = gimple_get_lhs (call);
11385 735 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11386 : {
11387 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11388 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11389 0 : gsi_replace (&gsi, new_stmt, true);
11390 : }
11391 : }
11392 4466 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11393 : {
11394 2295 : tree lhs = gimple_get_lhs (call);
11395 2295 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11396 : {
11397 0 : tree else_arg
11398 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11399 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11400 0 : gsi_replace (&gsi, new_stmt, true);
11401 : }
11402 : }
11403 2171 : else if (ifn == IFN_MASK_CALL
11404 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11405 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11406 2175 : && !STMT_VINFO_LIVE_P (stmt_info))
11407 : {
11408 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11409 4 : loop_vinfo->remove_stmt (stmt_info);
11410 : }
11411 : }
11412 : }
11413 :
11414 61709 : if (!uncounted_p)
11415 : {
11416 : /* The vectorization factor is always > 1, so if we use an IV increment of
11417 : 1. A zero NITERS becomes a nonzero NITERS_VECTOR. */
11418 61666 : if (integer_onep (step_vector))
11419 61648 : niters_no_overflow = true;
11420 :
11421 61666 : vect_set_loop_condition (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
11422 : loop_vinfo, niters_vector, step_vector,
11423 61666 : niters_vector_mult_vf, !niters_no_overflow);
11424 : }
11425 :
11426 61709 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11427 :
11428 : /* True if the final iteration might not handle a full vector's
11429 : worth of scalar iterations. */
11430 123418 : bool final_iter_may_be_partial
11431 61709 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11432 61709 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
11433 :
11434 : /* +1 to convert latch counts to loop iteration counts. */
11435 61709 : int bias_for_lowest = 1;
11436 :
11437 : /* When we are peeling for gaps then we take away one scalar iteration
11438 : from the vector loop. Thus we can adjust the upper bound by one
11439 : scalar iteration. But only when we know the bound applies to the
11440 : IV exit test which might not be true when we have multiple exits. */
11441 61709 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11442 120236 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11443 :
11444 61709 : int bias_for_assumed = bias_for_lowest;
11445 61709 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11446 61709 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11447 : {
11448 : /* When the amount of peeling is known at compile time, the first
11449 : iteration will have exactly alignment_npeels active elements.
11450 : In the worst case it will have at least one. */
11451 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11452 1 : bias_for_lowest += lowest_vf - min_first_active;
11453 1 : bias_for_assumed += assumed_vf - min_first_active;
11454 : }
11455 : /* In these calculations the "- 1" converts loop iteration counts
11456 : back to latch counts. */
11457 61709 : if (loop->any_upper_bound)
11458 : {
11459 61693 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11460 61693 : loop->nb_iterations_upper_bound
11461 61693 : = (final_iter_may_be_partial
11462 63100 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11463 2814 : lowest_vf) - 1
11464 60286 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11465 120572 : lowest_vf) - 1);
11466 61693 : if (main_vinfo
11467 : /* Both peeling for alignment and peeling for gaps can end up
11468 : with the scalar epilogue running for more than VF-1 iterations. */
11469 7058 : && !main_vinfo->peeling_for_alignment
11470 7010 : && !main_vinfo->peeling_for_gaps)
11471 : {
11472 6828 : unsigned int bound;
11473 6828 : poly_uint64 main_iters
11474 6828 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11475 : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11476 6828 : main_iters
11477 6828 : = upper_bound (main_iters,
11478 6828 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11479 13656 : if (can_div_away_from_zero_p (main_iters,
11480 6828 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11481 : &bound))
11482 6828 : loop->nb_iterations_upper_bound
11483 6828 : = wi::umin ((bound_wide_int) (bound - 1),
11484 6828 : loop->nb_iterations_upper_bound);
11485 : }
11486 : }
11487 61709 : if (loop->any_likely_upper_bound)
11488 61693 : loop->nb_iterations_likely_upper_bound
11489 61693 : = (final_iter_may_be_partial
11490 63100 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11491 1407 : + bias_for_lowest, lowest_vf) - 1
11492 60286 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11493 61693 : + bias_for_lowest, lowest_vf) - 1);
11494 61709 : if (loop->any_estimate)
11495 35447 : loop->nb_iterations_estimate
11496 35447 : = (final_iter_may_be_partial
11497 36140 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11498 1386 : assumed_vf) - 1
11499 34754 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11500 70201 : assumed_vf) - 1);
11501 61709 : scale_profile_for_vect_loop (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
11502 : assumed_vf, flat);
11503 :
11504 61709 : if (dump_enabled_p ())
11505 : {
11506 10949 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11507 : {
11508 9502 : dump_printf_loc (MSG_NOTE, vect_location,
11509 : "LOOP VECTORIZED\n");
11510 9502 : if (loop->inner)
11511 343 : dump_printf_loc (MSG_NOTE, vect_location,
11512 : "OUTER LOOP VECTORIZED\n");
11513 9502 : dump_printf (MSG_NOTE, "\n");
11514 : }
11515 : else
11516 1447 : dump_printf_loc (MSG_NOTE, vect_location,
11517 : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11518 1447 : GET_MODE_NAME (loop_vinfo->vector_mode));
11519 : }
11520 :
11521 : /* Loops vectorized with a variable factor won't benefit from
11522 : unrolling/peeling. */
11523 61709 : if (!vf.is_constant ())
11524 : {
11525 : loop->unroll = 1;
11526 : if (dump_enabled_p ())
11527 : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11528 : " variable-length vectorization factor\n");
11529 : }
11530 :
11531 : /* When we have unrolled the loop due to a user requested value we should
11532 : leave it up to the RTL unroll heuristics to determine if it's still worth
11533 : while to unroll more. */
11534 61709 : if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
11535 44 : loop->unroll = 0;
11536 :
11537 : /* Free SLP instances here because otherwise stmt reference counting
11538 : won't work. */
11539 : slp_instance instance;
11540 151758 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11541 90049 : vect_free_slp_instance (instance);
11542 61709 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11543 : /* Clear-up safelen field since its value is invalid after vectorization
11544 : since vectorized loop can have loop-carried dependencies. */
11545 61709 : loop->safelen = 0;
11546 :
11547 61709 : if (epilogue)
11548 : {
11549 : /* Accumulate past advancements made. */
11550 7058 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
11551 89 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
11552 : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
11553 : advance);
11554 7058 : update_epilogue_loop_vinfo (epilogue, advance);
11555 :
11556 7058 : epilogue->simduid = loop->simduid;
11557 7058 : epilogue->force_vectorize = loop->force_vectorize;
11558 7058 : epilogue->dont_vectorize = false;
11559 : }
11560 :
11561 61709 : return epilogue;
11562 61709 : }
11563 :
11564 : /* The code below is trying to perform simple optimization - revert
11565 : if-conversion for masked stores, i.e. if the mask of a store is zero
11566 : do not perform it and all stored value producers also if possible.
11567 : For example,
11568 : for (i=0; i<n; i++)
11569 : if (c[i])
11570 : {
11571 : p1[i] += 1;
11572 : p2[i] = p3[i] +2;
11573 : }
11574 : this transformation will produce the following semi-hammock:
11575 :
11576 : if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11577 : {
11578 : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11579 : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11580 : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11581 : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11582 : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11583 : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11584 : }
11585 : */
11586 :
void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      /* Argument 2 of an IFN_MASK_STORE call is its mask operand.  */
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different to LOOP when two
	 level loop-nest is vectorized and mask_store belongs to the inner
	 one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      /* The TRUE edge (mask all-zero, see the EQ_EXPR condition built
	 below) skips STORE_BB; the FALSE edge executes the stores.  */
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::likely ();
      e->probability = efalse->probability.invert ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  GSI walks
	     backwards from the statement just before the moved store.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having
		 volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      release_defs (stmt1);
		      continue;
		    }
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      /* The producer must read the same memory state as the
		 store it feeds, otherwise it cannot be sunk past it.  */
	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.
	     Only continue when the next store on the worklist uses the
	     same mask and is the statement we just stopped at.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      /* The in-edge of JOIN_BB on the bypass path carries the memory
	 state from before the (skipped) stores.  */
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}
11775 :
11776 : /* Decide whether it is possible to use a zero-based induction variable
11777 : when vectorizing LOOP_VINFO with partial vectors. If it is, return
11778 : the value that the induction variable must be able to hold in order
11779 : to ensure that the rgroups eventually have no active vector elements.
11780 : Return -1 otherwise. */
11781 :
11782 : widest_int
11783 33510 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11784 : {
11785 33510 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11786 33510 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11787 33510 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11788 :
11789 : /* Calculate the value that the induction variable must be able
11790 : to hit in order to ensure that we end the loop with an all-false mask.
11791 : This involves adding the maximum number of inactive trailing scalar
11792 : iterations. */
11793 33510 : widest_int iv_limit = -1;
11794 33510 : if (max_loop_iterations (loop, &iv_limit))
11795 : {
11796 33510 : if (niters_skip)
11797 : {
11798 : /* Add the maximum number of skipped iterations to the
11799 : maximum iteration count. */
11800 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
11801 0 : iv_limit += wi::to_widest (niters_skip);
11802 : else
11803 0 : iv_limit += max_vf - 1;
11804 : }
11805 33510 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11806 : /* Make a conservatively-correct assumption. */
11807 336 : iv_limit += max_vf - 1;
11808 :
11809 : /* IV_LIMIT is the maximum number of latch iterations, which is also
11810 : the maximum in-range IV value. Round this value down to the previous
11811 : vector alignment boundary and then add an extra full iteration. */
11812 33510 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11813 33510 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11814 : }
11815 33510 : return iv_limit;
11816 : }
11817 :
11818 : /* For the given rgroup_controls RGC, check whether an induction variable
11819 : would ever hit a value that produces a set of all-false masks or zero
11820 : lengths before wrapping around. Return true if it's possible to wrap
11821 : around before hitting the desirable value, otherwise return false. */
11822 :
11823 : bool
11824 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11825 : {
11826 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11827 :
11828 0 : if (iv_limit == -1)
11829 : return true;
11830 :
11831 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11832 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
11833 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11834 :
11835 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11836 : return true;
11837 :
11838 : return false;
11839 0 : }
|