Line data Source code
1 : /* Loop Vectorization
2 : Copyright (C) 2003-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #define INCLUDE_ALGORITHM
23 : #include "config.h"
24 : #include "system.h"
25 : #include "coretypes.h"
26 : #include "backend.h"
27 : #include "target.h"
28 : #include "rtl.h"
29 : #include "tree.h"
30 : #include "gimple.h"
31 : #include "cfghooks.h"
32 : #include "tree-pass.h"
33 : #include "ssa.h"
34 : #include "optabs-tree.h"
35 : #include "memmodel.h"
36 : #include "optabs.h"
37 : #include "diagnostic-core.h"
38 : #include "fold-const.h"
39 : #include "stor-layout.h"
40 : #include "cfganal.h"
41 : #include "gimplify.h"
42 : #include "gimple-iterator.h"
43 : #include "gimplify-me.h"
44 : #include "tree-ssa-loop-ivopts.h"
45 : #include "tree-ssa-loop-manip.h"
46 : #include "tree-ssa-loop-niter.h"
47 : #include "tree-ssa-loop.h"
48 : #include "cfgloop.h"
49 : #include "tree-scalar-evolution.h"
50 : #include "tree-vectorizer.h"
51 : #include "gimple-fold.h"
52 : #include "cgraph.h"
53 : #include "tree-cfg.h"
54 : #include "tree-if-conv.h"
55 : #include "internal-fn.h"
56 : #include "tree-vector-builder.h"
57 : #include "vec-perm-indices.h"
58 : #include "tree-eh.h"
59 : #include "case-cfn-macros.h"
60 : #include "langhooks.h"
61 : #include "opts.h"
62 : #include "hierarchical_discriminator.h"
63 :
64 : /* Loop Vectorization Pass.
65 :
66 : This pass tries to vectorize loops.
67 :
68 : For example, the vectorizer transforms the following simple loop:
69 :
70 : short a[N]; short b[N]; short c[N]; int i;
71 :
72 : for (i=0; i<N; i++){
73 : a[i] = b[i] + c[i];
74 : }
75 :
76 : as if it was manually vectorized by rewriting the source code into:
77 :
78 : typedef int __attribute__((mode(V8HI))) v8hi;
79 : short a[N]; short b[N]; short c[N]; int i;
80 : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
81 : v8hi va, vb, vc;
82 :
83 : for (i=0; i<N/8; i++){
84 : vb = pb[i];
85 : vc = pc[i];
86 : va = vb + vc;
87 : pa[i] = va;
88 : }
89 :
90 : The main entry to this pass is vectorize_loops(), in which
91 : the vectorizer applies a set of analyses on a given set of loops,
92 : followed by the actual vectorization transformation for the loops that
93 : had successfully passed the analysis phase.
94 : Throughout this pass we make a distinction between two types of
95 : data: scalars (which are represented by SSA_NAMES), and memory references
96 : ("data-refs"). These two types of data require different handling both
97 : during analysis and transformation. The types of data-refs that the
98 : vectorizer currently supports are ARRAY_REFS whose base is an array DECL
99 : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
100 : accesses are required to have a simple (consecutive) access pattern.
101 :
102 : Analysis phase:
103 : ===============
104 : The driver for the analysis phase is vect_analyze_loop().
105 : It applies a set of analyses, some of which rely on the scalar evolution
106 : analyzer (scev) developed by Sebastian Pop.
107 :
108 : During the analysis phase the vectorizer records some information
109 : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
110 : loop, as well as general information about the loop as a whole, which is
111 : recorded in a "loop_vec_info" struct attached to each loop.
112 :
113 : Transformation phase:
114 : =====================
115 : The loop transformation phase scans all the stmts in the loop, and
116 : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
117 : the loop that needs to be vectorized. It inserts the vector code sequence
118 : just before the scalar stmt S, and records a pointer to the vector code
119 : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
120 : attached to S). This pointer will be used for the vectorization of following
121 : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
122 : otherwise, we rely on dead code elimination for removing it.
123 :
124 : For example, say stmt S1 was vectorized into stmt VS1:
125 :
126 : VS1: vb = px[i];
127 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
128 : S2: a = b;
129 :
130 : To vectorize stmt S2, the vectorizer first finds the stmt that defines
131 : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
132 : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
133 : resulting sequence would be:
134 :
135 : VS1: vb = px[i];
136 : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
137 : VS2: va = vb;
138 : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
139 :
140 : Operands that are not SSA_NAMEs, are data-refs that appear in
141 : load/store operations (like 'x[i]' in S1), and are handled differently.
142 :
143 : Target modeling:
144 : =================
145 : Currently the only target specific information that is used is the
146 : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
147 : Targets that can support different sizes of vectors, for now will need
148 : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
149 : flexibility will be added in the future.
150 :
151 : Since we only vectorize operations whose vector form can be
152 : expressed using existing tree codes, to verify that an operation is
153 : supported, the vectorizer checks the relevant optab at the relevant
154 : machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
155 : the value found is CODE_FOR_nothing, then there's no target support, and
156 : we can't vectorize the stmt.
157 :
158 : For additional information on this project see:
159 : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 : */
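
/* As a concrete illustration of the optab query described above (a
   sketch, not the pass's own code; the real checks live in the
   vectorizable_* routines):

     optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
     if (op == unknown_optab
	 || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
       return false;   // no target support, the stmt cannot be vectorized

   optab_for_tree_code and optab_handler are the existing interfaces the
   vectorizer uses for such queries.  */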
161 :
162 : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
163 : unsigned *);
164 : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
165 : gphi **);
166 :
167 :
168 : /* Function vect_is_simple_iv_evolution.
169 :
170 : FORNOW: A simple evolution of an induction variable in the loop is
171 : considered a polynomial evolution. */
172 :
173 : static bool
174 902409 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
175 : stmt_vec_info stmt_info)
176 : {
177 902409 : tree init_expr;
178 902409 : tree step_expr;
179 902409 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
180 902409 : basic_block bb;
181 :
182 : /* When there is no evolution in this loop, the evolution function
183 : is not "simple". */
184 902409 : if (evolution_part == NULL_TREE)
185 : return false;
186 :
187 : /* When the evolution is a polynomial of degree >= 2
188 : the evolution function is not "simple". */
189 965989 : if (tree_is_chrec (evolution_part))
190 : return false;
191 :
192 792400 : step_expr = evolution_part;
193 792400 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
194 :
195 792400 : if (dump_enabled_p ())
196 39760 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
197 : step_expr, init_expr);
198 :
199 792400 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
200 792400 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;
201 :
202 792400 : if (TREE_CODE (step_expr) != INTEGER_CST
203 71314 : && (TREE_CODE (step_expr) != SSA_NAME
204 56384 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
205 56134 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
206 7772 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
207 131 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
208 131 : || !flag_associative_math)))
209 856037 : && (TREE_CODE (step_expr) != REAL_CST
210 431 : || !flag_associative_math))
211 : {
212 63580 : if (dump_enabled_p ())
213 3064 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
214 : "step unknown.\n");
215 63580 : return false;
216 : }
217 :
218 : return true;
219 : }
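
/* For illustration (a sketch, not taken from the pass): given

     for (i = 0; i < n; i++)
       p = p + 4;

   scev computes the access function of p as the chrec {p_0, +, 4}_1;
   its evolution part is the INTEGER_CST 4, so the IV is "simple".
   A second-degree chrec such as {{0, +, 1}_1, +, 2}_1 has a chrec as
   its evolution part and is rejected by the tree_is_chrec check.  */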
220 :
221 : /* Function vect_is_nonlinear_iv_evolution
222 :
223 : Only support nonlinear induction for integer type:
224 : 1. neg
225 : 2. mul by constant
226 : 3. lshift/rshift by constant.
227 :
228 : For neg induction, return a fake step as integer -1. */
229 : static bool
230 170998 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
231 : gphi* loop_phi_node)
232 : {
233 170998 : tree init_expr, ev_expr, result, op1, op2;
234 170998 : gimple* def;
235 :
236 170998 : if (gimple_phi_num_args (loop_phi_node) != 2)
237 : return false;
238 :
239 170998 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
240 170998 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
241 :
242 : /* Support nonlinear induction only for integer type. */
243 170998 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
244 : return false;
245 :
246 108062 : result = PHI_RESULT (loop_phi_node);
247 :
248 108062 : if (TREE_CODE (ev_expr) != SSA_NAME
249 105756 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
250 108062 : || !is_gimple_assign (def))
251 : return false;
252 :
253 97203 : enum tree_code t_code = gimple_assign_rhs_code (def);
254 97203 : tree step;
255 97203 : switch (t_code)
256 : {
257 3508 : case NEGATE_EXPR:
258 3508 : if (gimple_assign_rhs1 (def) != result)
259 : return false;
260 3508 : step = build_int_cst (TREE_TYPE (init_expr), -1);
261 3508 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
262 3508 : break;
263 :
264 11327 : case RSHIFT_EXPR:
265 11327 : case LSHIFT_EXPR:
266 11327 : case MULT_EXPR:
267 11327 : op1 = gimple_assign_rhs1 (def);
268 11327 : op2 = gimple_assign_rhs2 (def);
269 11327 : if (TREE_CODE (op2) != INTEGER_CST
270 7455 : || op1 != result)
271 : return false;
272 7070 : step = op2;
273 7070 : if (t_code == LSHIFT_EXPR)
274 472 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
275 6598 : else if (t_code == RSHIFT_EXPR)
276 5622 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
277 : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
278 : else
279 976 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
280 : break;
281 :
282 : default:
283 : return false;
284 : }
285 :
286 10578 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
287 10578 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
288 :
289 10578 : return true;
290 : }
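
/* Examples of the nonlinear inductions classified above (a sketch):

     for (...) x = -x;       // vect_step_op_neg, fake step -1
     for (...) x = x * 3;    // vect_step_op_mul, step 3
     for (...) x = x << 2;   // vect_step_op_shl, step 2
     for (...) x = x >> 1;   // vect_step_op_shr, step 1

   In each case the latch value must be defined by a single assignment
   whose first operand is the PHI result and whose step is constant.  */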
291 :
292 : /* Returns true if Phi is a first-order recurrence. A first-order
293 : recurrence is a non-reduction recurrence relation in which the value of
294 : the recurrence in the current loop iteration equals a value defined in
295 : the previous iteration. */
296 :
297 : static bool
298 65952 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
299 : gphi *phi)
300 : {
301 : /* A nested cycle isn't vectorizable as first order recurrence. */
302 65952 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
303 : return false;
304 :
305 : /* Ensure the loop latch definition is from within the loop. */
306 65786 : edge latch = loop_latch_edge (loop);
307 65786 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
308 65786 : if (TREE_CODE (ldef) != SSA_NAME
309 63137 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
310 63071 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
311 124240 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
312 7982 : return false;
313 :
314 57804 : tree def = gimple_phi_result (phi);
315 :
316 : /* Ensure every use_stmt of the phi node is dominated by the latch
317 : definition. */
318 57804 : imm_use_iterator imm_iter;
319 57804 : use_operand_p use_p;
320 127863 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
321 69552 : if (!is_gimple_debug (USE_STMT (use_p))
322 135512 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
323 45619 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
324 : USE_STMT (use_p))))
325 57297 : return false;
326 :
327 : /* First-order recurrence autovectorization needs shuffle vector. */
328 507 : tree scalar_type = TREE_TYPE (def);
329 507 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
330 507 : if (!vectype)
331 : return false;
332 :
333 : return true;
334 : }
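
/* A first-order recurrence accepted by the predicate above looks like
   this (a sketch):

     t = 0;
     for (i = 0; i < n; i++)
       {
	 b[i] = a[i] + t;   // uses the value from iteration i-1
	 t = a[i];          // latch definition
       }

   After CSE the latch value is the load of a[i], whose definition
   dominates the only use of the PHI, so the dominance check succeeds;
   vectorizing it needs a permute of the previous vector of t values,
   hence the requirement that a vector type exists for TREE_TYPE (def).  */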
335 :
336 : /* Function vect_analyze_scalar_cycles_1.
337 :
338 : Examine the cross iteration def-use cycles of scalar variables
339 : in LOOP. LOOP_VINFO represents the loop that is now being
340 : considered for vectorization (can be LOOP, or an outer-loop
341 : enclosing LOOP). */
343 :
344 : static void
345 443534 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
346 : {
347 443534 : basic_block bb = loop->header;
348 443534 : auto_vec<stmt_vec_info, 64> worklist;
349 443534 : gphi_iterator gsi;
350 :
351 443534 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
352 :
353 : /* First - identify all inductions. Reduction detection assumes that all the
354 : inductions have been identified, therefore, this order must not be
355 : changed. */
356 1585962 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
357 : {
358 1142428 : gphi *phi = gsi.phi ();
359 1142428 : tree access_fn = NULL;
360 1142428 : tree def = PHI_RESULT (phi);
361 1142428 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
362 :
363 : /* Skip virtual phi's. The data dependences that are associated with
364 : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
365 2284856 : if (virtual_operand_p (def))
366 403036 : continue;
367 :
368 : /* Skip already analyzed inner loop PHIs of double reductions. */
369 903409 : if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
370 1000 : continue;
371 :
372 902409 : if (dump_enabled_p ())
373 41876 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
374 : (gimple *) phi);
375 :
376 902409 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
377 :
378 : /* Analyze the evolution function. */
379 902409 : access_fn = analyze_scalar_evolution (loop, def);
380 902409 : if (dump_enabled_p ())
381 41876 : dump_printf_loc (MSG_NOTE, vect_location,
382 : "Access function of PHI: %T\n", access_fn);
383 902409 : if (access_fn)
384 902409 : STRIP_NOPS (access_fn);
385 :
386 1065426 : if ((!access_fn
387 902409 : || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
388 728820 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
389 11379 : && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
390 : != INTEGER_CST)))
391 : /* Only handle nonlinear iv for same loop. */
392 1076004 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
393 170998 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
394 : {
395 163017 : worklist.safe_push (stmt_vinfo);
396 163017 : continue;
397 : }
398 :
399 739392 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
400 : != NULL_TREE);
401 739392 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
402 :
403 739392 : if (dump_enabled_p ())
404 36805 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
405 739392 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
406 :
407 : /* Mark if we have a non-linear IV. */
408 739392 : LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
409 739392 : |= STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
410 : }
411 :
412 :
413 : /* Second - identify all reductions and nested cycles. */
414 606551 : while (worklist.length () > 0)
415 : {
416 163017 : stmt_vec_info stmt_vinfo = worklist.pop ();
417 163017 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
418 163017 : tree def = PHI_RESULT (phi);
419 :
420 163017 : if (dump_enabled_p ())
421 5071 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
422 : (gimple *) phi);
423 :
424 326034 : gcc_assert (!virtual_operand_p (def)
425 : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
426 :
427 163017 : gphi *double_reduc;
428 163017 : stmt_vec_info reduc_stmt_info
429 163017 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
430 163017 : if (reduc_stmt_info && double_reduc)
431 : {
432 1102 : stmt_vec_info inner_phi_info
433 1102 : = loop_vinfo->lookup_stmt (double_reduc);
434 : /* ??? Pass down flag we're the inner loop of a double reduc. */
435 1102 : stmt_vec_info inner_reduc_info
436 1102 : = vect_is_simple_reduction (loop_vinfo, inner_phi_info, NULL);
437 1102 : if (inner_reduc_info)
438 : {
439 1000 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
440 1000 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
441 1000 : STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
442 1000 : STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
443 1000 : if (dump_enabled_p ())
444 130 : dump_printf_loc (MSG_NOTE, vect_location,
445 : "Detected double reduction.\n");
446 :
447 1000 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
448 1000 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
449 1000 : STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
450 : /* Make it accessible for SLP vectorization. */
451 1000 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
452 : }
453 102 : else if (dump_enabled_p ())
454 14 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
455 : "Unknown def-use cycle pattern.\n");
456 : }
457 161915 : else if (reduc_stmt_info)
458 : {
459 95963 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
460 : {
461 2431 : if (dump_enabled_p ())
462 434 : dump_printf_loc (MSG_NOTE, vect_location,
463 : "Detected vectorizable nested cycle.\n");
464 :
465 2431 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
466 : }
467 : else
468 : {
469 93532 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
470 93532 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
471 93532 : if (dump_enabled_p ())
472 3942 : dump_printf_loc (MSG_NOTE, vect_location,
473 : "Detected reduction.\n");
474 :
475 93532 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
476 93532 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
477 93532 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
478 : }
479 : }
480 65952 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
481 501 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
482 : else
483 65451 : if (dump_enabled_p ())
484 477 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
485 : "Unknown def-use cycle pattern.\n");
486 : }
487 443534 : }
488 :
489 :
490 : /* Function vect_analyze_scalar_cycles.
491 :
492 : Examine the cross iteration def-use cycles of scalar variables, by
493 : analyzing the loop-header PHIs of scalar variables. Classify each
494 : cycle as one of the following: invariant, induction, reduction, unknown.
495 : We do that for the loop represented by LOOP_VINFO, and also for its
496 : inner-loop, if it exists.
497 : Examples for scalar cycles:
498 :
499 : Example1: reduction:
500 :
501 : loop1:
502 : for (i=0; i<N; i++)
503 : sum += a[i];
504 :
505 : Example2: induction:
506 :
507 : loop2:
508 : for (i=0; i<N; i++)
509 : a[i] = i; */
510 :
511 : static void
512 437751 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
513 : {
514 437751 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
515 :
516 437751 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
517 :
518 : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
519 : Reductions in such inner-loop therefore have different properties than
520 : the reductions in the nest that gets vectorized:
521 : 1. When vectorized, they are executed in the same order as in the original
522 : scalar loop, so we can't change the order of computation when
523 : vectorizing them.
524 : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
525 : current checks are too strict. */
526 :
527 437751 : if (loop->inner)
528 5783 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
529 437751 : }
530 :
531 : /* Function vect_get_loop_niters.
532 :
533 : Determine how many iterations the loop is executed and place it
534 : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
535 : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
536 : niter information holds in ASSUMPTIONS.
537 :
538 : Return the loop exit conditions. */
539 :
540 :
541 : static vec<gcond *>
542 278208 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
543 : tree *number_of_iterations, tree *number_of_iterationsm1)
544 : {
545 278208 : auto_vec<edge> exits = get_loop_exit_edges (loop);
546 278208 : vec<gcond *> conds;
547 556416 : conds.create (exits.length ());
548 278208 : class tree_niter_desc niter_desc;
549 278208 : tree niter_assumptions, niter, may_be_zero;
550 :
551 278208 : *assumptions = boolean_true_node;
552 278208 : *number_of_iterationsm1 = chrec_dont_know;
553 278208 : *number_of_iterations = chrec_dont_know;
554 :
555 278208 : DUMP_VECT_SCOPE ("get_loop_niters");
556 :
557 278208 : if (exits.is_empty ())
558 0 : return conds;
559 :
560 278208 : if (dump_enabled_p ())
561 14609 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
562 : exits.length ());
563 :
564 : edge exit;
565 : unsigned int i;
566 677152 : FOR_EACH_VEC_ELT (exits, i, exit)
567 : {
568 398944 : gcond *cond = get_loop_exit_condition (exit);
569 398944 : if (cond)
570 398911 : conds.safe_push (cond);
571 :
572 398944 : if (dump_enabled_p ())
573 15743 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
574 :
575 398944 : if (exit != main_exit)
576 179193 : continue;
577 :
578 278208 : may_be_zero = NULL_TREE;
579 278208 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
580 278208 : || chrec_contains_undetermined (niter_desc.niter))
581 58457 : continue;
582 :
583 219751 : niter_assumptions = niter_desc.assumptions;
584 219751 : may_be_zero = niter_desc.may_be_zero;
585 219751 : niter = niter_desc.niter;
586 :
587 219751 : if (may_be_zero && integer_zerop (may_be_zero))
588 : may_be_zero = NULL_TREE;
589 :
590 9343 : if (may_be_zero)
591 : {
592 9343 : if (COMPARISON_CLASS_P (may_be_zero))
593 : {
594 : /* Try to combine may_be_zero with assumptions, this can simplify
595 : computation of niter expression. */
596 9343 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
597 1023 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
598 : niter_assumptions,
599 : fold_build1 (TRUTH_NOT_EXPR,
600 : boolean_type_node,
601 : may_be_zero));
602 : else
603 8320 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
604 : build_int_cst (TREE_TYPE (niter), 0),
605 : rewrite_to_non_trapping_overflow (niter));
606 :
607 219751 : may_be_zero = NULL_TREE;
608 : }
609 0 : else if (integer_nonzerop (may_be_zero))
610 : {
611 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
612 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
613 0 : continue;
614 : }
615 : else
616 0 : continue;
617 : }
618 :
619 : /* Loop assumptions are based on the normal exit. */
620 219751 : *assumptions = niter_assumptions;
621 219751 : *number_of_iterationsm1 = niter;
622 :
623 : /* We want the number of loop header executions which is the number
624 : of latch executions plus one.
625 : ??? For UINT_MAX latch executions this number overflows to zero
626 : for loops like do { n++; } while (n != 0); */
627 219751 : if (niter && !chrec_contains_undetermined (niter))
628 : {
629 219751 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
630 : unshare_expr (niter),
631 : build_int_cst (TREE_TYPE (niter), 1));
632 219751 : if (TREE_CODE (niter) == INTEGER_CST
633 120899 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
634 : {
635 : /* If we manage to fold niter + 1 into INTEGER_CST even when
636 : niter is some complex expression, ensure back
637 : *number_of_iterationsm1 is an INTEGER_CST as well. See
638 : PR113210. */
639 0 : *number_of_iterationsm1
640 0 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
641 : build_minus_one_cst (TREE_TYPE (niter)));
642 : }
643 : }
644 219751 : *number_of_iterations = niter;
645 : }
646 :
647 278208 : if (dump_enabled_p ())
648 14609 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
649 :
650 278208 : return conds;
651 278208 : }
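
/* Worked example for the may_be_zero handling above (a sketch): if
   niter analysis yields NITER = n - 1 with MAY_BE_ZERO = (n <= 0),
   either !(n <= 0) is folded into the assumptions or NITER is
   rewritten to n <= 0 ? 0 : n - 1; the number of header iterations
   placed in *NUMBER_OF_ITERATIONS is then NITER + 1.  */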
652 :
653 : /* Determine the main loop exit for the vectorizer. */
654 :
655 : edge
656 489902 : vec_init_loop_exit_info (class loop *loop)
657 : {
658 : /* Before we begin we must first determine which exit is the main one and
659 : which are auxilary exits. */
660 489902 : auto_vec<edge> exits = get_loop_exit_edges (loop);
661 974767 : if (exits.length () == 0)
662 : return NULL;
663 484865 : if (exits.length () == 1)
664 319798 : return exits[0];
665 :
666 : /* If we have multiple exits, look for counting IV exit.
667 : Analyze all exits and return the last one we can analyze. */
668 165067 : class tree_niter_desc niter_desc;
669 165067 : edge candidate = NULL;
670 612996 : for (edge exit : exits)
671 : {
672 468001 : if (!get_loop_exit_condition (exit))
673 : {
674 20072 : if (dump_enabled_p ())
675 14 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
676 : "Unhandled loop exit detected.\n");
677 20072 : return NULL;
678 : }
679 :
680 447929 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
681 447929 : && !chrec_contains_undetermined (niter_desc.niter))
682 : {
683 132395 : tree may_be_zero = niter_desc.may_be_zero;
684 132395 : if ((integer_zerop (may_be_zero)
685 : /* As we are handling may_be_zero that's not false by
686 : rewriting niter to may_be_zero ? 0 : niter we require
687 : an empty latch. */
688 458426 : || (single_pred_p (loop->latch)
689 10150 : && exit->src == single_pred (loop->latch)
690 2660 : && (integer_nonzerop (may_be_zero)
691 2660 : || COMPARISON_CLASS_P (may_be_zero))))
692 135055 : && (!candidate
693 5878 : || dominated_by_p (CDI_DOMINATORS, exit->src,
694 5878 : candidate->src)))
695 : candidate = exit;
696 : }
697 : }
698 :
699 : /* If no exit is analyzable by scalar evolution, we return the last exit
700 : under the assumption we are dealing with an uncounted loop. */
701 199836 : if (!candidate && single_pred_p (loop->latch))
702 34769 : candidate = loop_exits_from_bb_p (loop, single_pred (loop->latch));
703 :
704 : return candidate;
705 165067 : }
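
/* For example (a sketch): in

     for (i = 0; i < n; i++)
       if (a[i] == x)
	 break;

   the loop has two exits; the counting-IV exit (i < n failing) is the
   one number_of_iterations_exit_assumptions can analyze and becomes
   the main exit, while the early break is an auxiliary exit.  */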
706 :
707 : /* Function bb_in_loop_p
708 :
709 : Used as predicate for dfs order traversal of the loop bbs. */
710 :
711 : static bool
712 1673536 : bb_in_loop_p (const_basic_block bb, const void *data)
713 : {
714 1673536 : const class loop *const loop = (const class loop *)data;
715 1673536 : if (flow_bb_inside_loop_p (loop, bb))
716 : return true;
717 : return false;
718 : }
719 :
720 :
721 : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
722 : stmt_vec_info structs for all the stmts in LOOP_IN. */
723 :
724 573065 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
725 : : vec_info (vec_info::loop, shared),
726 573065 : loop (loop_in),
727 573065 : num_itersm1 (NULL_TREE),
728 573065 : num_iters (NULL_TREE),
729 573065 : num_iters_unchanged (NULL_TREE),
730 573065 : num_iters_assumptions (NULL_TREE),
731 573065 : vector_costs (nullptr),
732 573065 : scalar_costs (nullptr),
733 573065 : th (0),
734 573065 : versioning_threshold (0),
735 573065 : vectorization_factor (0),
736 573065 : main_loop_edge (nullptr),
737 573065 : skip_main_loop_edge (nullptr),
738 573065 : skip_this_loop_edge (nullptr),
739 573065 : reusable_accumulators (),
740 573065 : suggested_unroll_factor (1),
741 573065 : max_vectorization_factor (0),
742 573065 : mask_skip_niters (NULL_TREE),
743 573065 : mask_skip_niters_pfa_offset (NULL_TREE),
744 573065 : rgroup_compare_type (NULL_TREE),
745 573065 : simd_if_cond (NULL_TREE),
746 573065 : partial_vector_style (vect_partial_vectors_none),
747 573065 : unaligned_dr (NULL),
748 573065 : peeling_for_alignment (0),
749 573065 : ptr_mask (0),
750 573065 : max_spec_read_amount (0),
751 573065 : nonlinear_iv (false),
752 573065 : ivexpr_map (NULL),
753 573065 : scan_map (NULL),
754 573065 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
755 573065 : vectorizable (false),
756 573065 : can_use_partial_vectors_p (true),
757 573065 : must_use_partial_vectors_p (false),
758 573065 : using_partial_vectors_p (false),
759 573065 : using_decrementing_iv_p (false),
760 573065 : using_select_vl_p (false),
761 573065 : allow_mutual_alignment (false),
762 573065 : partial_load_store_bias (0),
763 573065 : peeling_for_gaps (false),
764 573065 : peeling_for_niter (false),
765 573065 : early_breaks (false),
766 573065 : loop_iv_cond (NULL),
767 573065 : user_unroll (false),
768 573065 : no_data_dependencies (false),
769 573065 : has_mask_store (false),
770 573065 : scalar_loop_scaling (profile_probability::uninitialized ()),
771 573065 : scalar_loop (NULL),
772 573065 : main_loop_info (NULL),
773 573065 : orig_loop_info (NULL),
774 573065 : epilogue_vinfo (NULL),
775 573065 : drs_advanced_by (NULL_TREE),
776 573065 : vec_loop_main_exit (NULL),
777 573065 : vec_epilogue_loop_main_exit (NULL),
778 573065 : scalar_loop_main_exit (NULL)
779 : {
780 : /* CHECKME: We want to visit all BBs before their successors (except for
781 : latch blocks, for which this assertion wouldn't hold). In the simple
782 : case of the loop forms we allow, a dfs order of the BBs would be the same
783 : as reversed postorder traversal, so we are safe. */
784 :
785 573065 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
786 1146130 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
787 573065 : loop->num_nodes, loop);
788 573065 : gcc_assert (nbbs == loop->num_nodes);
789 :
790 1996366 : for (unsigned int i = 0; i < nbbs; i++)
791 : {
792 1423301 : basic_block bb = bbs[i];
793 1423301 : gimple_stmt_iterator si;
794 :
795 2939107 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
796 : {
797 1515806 : gimple *phi = gsi_stmt (si);
798 1515806 : gimple_set_uid (phi, 0);
799 1515806 : add_stmt (phi);
800 : }
801 :
802 12939785 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
803 : {
804 10093183 : gimple *stmt = gsi_stmt (si);
805 10093183 : gimple_set_uid (stmt, 0);
806 10093183 : if (is_gimple_debug (stmt))
807 4153102 : continue;
808 5940081 : add_stmt (stmt);
809 : /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
810 : third argument is the #pragma omp simd if (x) condition, when 0,
811 : loop shouldn't be vectorized, when non-zero constant, it should
812 : be vectorized normally, otherwise versioned with vectorized loop
813 : done if the condition is non-zero at runtime. */
814 5940081 : if (loop_in->simduid
815 43372 : && is_gimple_call (stmt)
816 4268 : && gimple_call_internal_p (stmt)
817 4141 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
818 4137 : && gimple_call_num_args (stmt) >= 3
819 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
820 5940184 : && (loop_in->simduid
821 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
822 : {
823 103 : tree arg = gimple_call_arg (stmt, 2);
824 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
825 103 : simd_if_cond = arg;
826 : else
827 0 : gcc_assert (integer_nonzerop (arg));
828 : }
829 : }
830 : }
831 573065 : }
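
/* Example of the simd_if_cond detection above (a sketch): for

     #pragma omp simd if (x)
     for (i = 0; i < n; i++)
       a[i] += b[i];

   omp lowering inserts a call like
   .GOMP_SIMD_LANE (simduid.N_1, ..., x_2(D)) into the loop; the third
   argument is recorded in simd_if_cond so the loop can later be
   versioned on the condition being non-zero at runtime.  */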
832 :
833 : /* Free all levels of rgroup CONTROLS. */
834 :
835 : void
836 1420818 : release_vec_loop_controls (vec<rgroup_controls> *controls)
837 : {
838 1420818 : rgroup_controls *rgc;
839 1420818 : unsigned int i;
840 1445211 : FOR_EACH_VEC_ELT (*controls, i, rgc)
841 24393 : rgc->controls.release ();
842 1420818 : controls->release ();
843 1420818 : }
844 :
845 : /* Free all memory used by the _loop_vec_info, as well as all the
846 : stmt_vec_info structs of all the stmts in the loop. */
847 :
848 573065 : _loop_vec_info::~_loop_vec_info ()
849 : {
850 573065 : free (bbs);
851 :
852 573065 : release_vec_loop_controls (&masks.rgc_vec);
853 573065 : release_vec_loop_controls (&lens);
854 576897 : delete ivexpr_map;
855 573387 : delete scan_map;
856 573065 : delete scalar_costs;
857 573065 : delete vector_costs;
858 784521 : for (auto reduc_info : reduc_infos)
859 202932 : delete reduc_info;
860 :
861 : /* When we release an epilogue vinfo that we do not intend to use
862 : avoid clearing AUX of the main loop which should continue to
863 : point to the main loop vinfo since otherwise we'll leak that. */
864 573065 : if (loop->aux == this)
865 61416 : loop->aux = NULL;
866 1146130 : }
867 :
868 : /* Return an invariant or register for EXPR and emit necessary
869 : computations in the LOOP_VINFO loop preheader. */
870 :
871 : tree
872 19662 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
873 : {
874 19662 : if (is_gimple_reg (expr)
875 19662 : || is_gimple_min_invariant (expr))
876 6804 : return expr;
877 :
878 12858 : if (! loop_vinfo->ivexpr_map)
879 3832 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
880 12858 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
881 12858 : if (! cached)
882 : {
883 8319 : gimple_seq stmts = NULL;
884 8319 : cached = force_gimple_operand (unshare_expr (expr),
885 : &stmts, true, NULL_TREE);
886 8319 : if (stmts)
887 : {
888 8179 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
889 8179 : gsi_insert_seq_on_edge_immediate (e, stmts);
890 : }
891 : }
892 12858 : return cached;
893 : }
894 :
895 : /* Return true if we can use CMP_TYPE as the comparison type to produce
896 : all masks required to mask LOOP_VINFO. */
897 :
898 : static bool
899 109692 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
900 : {
901 109692 : rgroup_controls *rgm;
902 109692 : unsigned int i;
903 125196 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
904 125196 : if (rgm->type != NULL_TREE
905 125196 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
906 : cmp_type, rgm->type,
907 : OPTIMIZE_FOR_SPEED))
908 : return false;
909 : return true;
910 : }
911 :
912 : /* Calculate the maximum number of scalars per iteration for every
913 : rgroup in LOOP_VINFO. */
914 :
915 : static unsigned int
916 23397 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
917 : {
918 23397 : unsigned int res = 1;
919 23397 : unsigned int i;
920 23397 : rgroup_controls *rgm;
921 56062 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
922 32665 : res = MAX (res, rgm->max_nscalars_per_iter);
923 23397 : return res;
924 : }
925 :
926 : /* Calculate the minimum precision necessary to represent:
927 :
928 : MAX_NITERS * FACTOR
929 :
930 : as an unsigned integer, where MAX_NITERS is the maximum number of
931 : loop header iterations for the original scalar form of LOOP_VINFO. */
932 :
933 : unsigned
934 25772 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
935 : {
936 25772 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
937 :
938 : /* Get the maximum number of iterations that is representable
939 : in the counter type. */
940 25772 : tree ni_type;
941 25772 : if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
942 25772 : ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
943 : else
944 0 : ni_type = sizetype;
945 25772 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
946 :
947 : /* Get a more refined estimate for the number of iterations. */
948 25772 : widest_int max_back_edges;
949 25772 : if (max_loop_iterations (loop, &max_back_edges))
950 25772 : max_ni = wi::smin (max_ni, max_back_edges + 1);
951 :
952 : /* Work out how many bits we need to represent the limit. */
953 25772 : return wi::min_precision (max_ni * factor, UNSIGNED);
954 25772 : }
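
/* For example (a sketch): if the niter type is a 32-bit unsigned type
   but max_loop_iterations bounds the loop at 999 latch executions,
   MAX_NI becomes 1000; with FACTOR = 2 the result is
   wi::min_precision (2000, UNSIGNED), i.e. 11 bits.  */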
955 :
956 : /* True if the loop needs peeling or partial vectors when vectorized. */
957 :
958 : static bool
959 154925 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
960 : {
961 154925 : unsigned HOST_WIDE_INT const_vf;
962 :
963 154925 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
964 : return true;
965 :
966 13443 : loop_vec_info main_loop_vinfo
967 153635 : = (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
968 153635 : ? LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) : loop_vinfo);
969 153635 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
970 78905 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo) >= 0)
971 : {
972 : /* Work out the (constant) number of iterations that need to be
973 : peeled for reasons other than niters. */
974 78855 : unsigned int peel_niter
975 : = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
976 78855 : return !multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
977 78855 : LOOP_VINFO_VECT_FACTOR (loop_vinfo));
978 : }
979 :
980 74780 : if (!LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo)
981 74780 : && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf))
982 : {
983 : /* When the number of iterations is a multiple of the vectorization
984 : factor and we are not doing prologue or forced epilogue peeling
985 : the epilogue isn't necessary. */
986 74358 : if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
987 148716 : >= (unsigned) exact_log2 (const_vf))
988 : return false;
989 : }
990 :
991 : return true;
992 : }
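
/* For example (a sketch): with a compile-time NITERS of 100, VF = 8
   and no prologue peeling, 100 is not a multiple of 8, so peeling or
   partial vectors are needed; with NITERS = 96 the function returns
   false and no epilogue is required.  */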
993 :
994 : /* Each statement in LOOP_VINFO can be masked where necessary. Check
995 : whether we can actually generate the masks required. Return true if so,
996 : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
997 :
998 : static bool
999 23397 : vect_verify_full_masking (loop_vec_info loop_vinfo)
1000 : {
1001 23397 : unsigned int min_ni_width;
1002 :
1003 : /* Use a normal loop if there are no statements that need masking.
1004 : This only happens in rare degenerate cases: it means that the loop
1005 : has no loads, no stores, and no live-out values. */
1006 23397 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1007 : return false;
1008 :
1009 : /* Produce the rgroup controls. */
1010 92239 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1011 : {
1012 34421 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1013 34421 : tree vectype = mask.first;
1014 34421 : unsigned nvectors = mask.second;
1015 :
1016 45445 : if (masks->rgc_vec.length () < nvectors)
1017 25538 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1018 34421 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1019 : /* The number of scalars per iteration and the number of vectors are
1020 : both compile-time constants. */
1021 34421 : unsigned int nscalars_per_iter
1022 34421 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1023 34421 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1024 :
1025 34421 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1026 : {
1027 27449 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1028 27449 : rgm->type = truth_type_for (vectype);
1029 27449 : rgm->factor = 1;
1030 : }
1031 : }
1032 :
1033 23397 : unsigned int max_nscalars_per_iter
1034 23397 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1035 :
1036 : /* Work out how many bits we need to represent the limit. */
1037 23397 : min_ni_width
1038 23397 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1039 :
1040 : /* Find a scalar mode for which WHILE_ULT is supported. */
1041 23397 : opt_scalar_int_mode cmp_mode_iter;
1042 23397 : tree cmp_type = NULL_TREE;
1043 23397 : tree iv_type = NULL_TREE;
1044 23397 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1045 23397 : unsigned int iv_precision = UINT_MAX;
1046 :
1047 23397 : if (iv_limit != -1)
1048 23397 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1049 : UNSIGNED);
1050 :
1051 187176 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1052 : {
1053 163779 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1054 163779 : if (cmp_bits >= min_ni_width
1055 163779 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1056 : {
1057 109692 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1058 109692 : if (this_type
1059 109692 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1060 : {
1061 : /* Although we could stop as soon as we find a valid mode,
1062 : there are at least two reasons why that's not always the
1063 : best choice:
1064 :
1065 : - An IV that's Pmode or wider is more likely to be reusable
1066 : in address calculations than an IV that's narrower than
1067 : Pmode.
1068 :
1069 : - Doing the comparison in IV_PRECISION or wider allows
1070 : a natural 0-based IV, whereas using a narrower comparison
1071 : type requires mitigations against wrap-around.
1072 :
1073 : Conversely, if the IV limit is variable, doing the comparison
1074 : in a wider type than the original type can introduce
1075 : unnecessary extensions, so picking the widest valid mode
1076 : is not always a good choice either.
1077 :
1078 : Here we prefer the first IV type that's Pmode or wider,
1079 : and the first comparison type that's IV_PRECISION or wider.
1080 : (The comparison type must be no wider than the IV type,
1081 : to avoid extensions in the vector loop.)
1082 :
1083 : ??? We might want to try continuing beyond Pmode for ILP32
1084 : targets if CMP_BITS < IV_PRECISION. */
1085 0 : iv_type = this_type;
1086 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1087 : cmp_type = this_type;
1088 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1089 : break;
1090 : }
1091 : }
1092 : }
1093 :
1094 23397 : if (!cmp_type)
1095 : {
1096 23397 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1097 23397 : return false;
1098 : }
1099 :
1100 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1101 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1102 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1103 0 : return true;
1104 23397 : }
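
/* For example (a sketch): with VF = 16 and one rgroup whose mask type
   is vector(16) <signed-boolean:1>, full masking requires the target
   to support

     mask = .WHILE_ULT (i, niters, <all-true>);

   which sets mask lane k iff i + k < niters; the comparison type
   chosen above is the scalar type used for i and niters.  */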
1105 :
1106 : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1107 : whether we can actually generate AVX512 style masks. Return true if so,
1108 : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1109 :
1110 : static bool
1111 23397 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1112 : {
1113 : /* Produce differently organized rgc_vec and differently check
1114 : we can produce masks. */
1115 :
1116 : /* Use a normal loop if there are no statements that need masking.
1117 : This only happens in rare degenerate cases: it means that the loop
1118 : has no loads, no stores, and no live-out values. */
1119 23397 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1120 : return false;
1121 :
1122 : /* For the decrementing IV we need to represent all values in
1123 : [0, niter + niter_skip] where niter_skip is the elements we
1124 : skip in the first iteration for prologue peeling. */
1125 23397 : tree iv_type = NULL_TREE;
1126 23397 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1127 23397 : unsigned int iv_precision = UINT_MAX;
1128 23397 : if (iv_limit != -1)
1129 23397 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1130 :
1131 : /* First compute the type for the IV we use to track the remaining
1132 : scalar iterations. */
1133 23397 : opt_scalar_int_mode cmp_mode_iter;
1134 30683 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1135 : {
1136 30683 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1137 30683 : if (cmp_bits >= iv_precision
1138 30683 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1139 : {
1140 23397 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1141 23397 : if (iv_type)
1142 : break;
1143 : }
1144 : }
1145 23397 : if (!iv_type)
1146 : return false;
1147 :
1148 : /* Produce the rgroup controls. */
1149 92239 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1150 : {
1151 34421 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1152 34421 : tree vectype = mask.first;
1153 34421 : unsigned nvectors = mask.second;
1154 :
1155 : /* The number of scalars per iteration and the number of vectors are
1156 : both compile-time constants. */
1157 34421 : unsigned int nscalars_per_iter
1158 34421 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1159 34421 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1160 :
1161 : /* We index the rgroup_controls vector with nscalars_per_iter
1162 : which we keep constant and instead have a varying nvectors,
1163 : remembering the vector mask with the fewest nV. */
1164 45445 : if (masks->rgc_vec.length () < nscalars_per_iter)
1165 23469 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1166 34421 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1167 :
1168 34421 : if (!rgm->type || rgm->factor > nvectors)
1169 : {
1170 25358 : rgm->type = truth_type_for (vectype);
1171 25358 : rgm->compare_type = NULL_TREE;
1172 25358 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1173 25358 : rgm->factor = nvectors;
1174 25358 : rgm->bias_adjusted_ctrl = NULL_TREE;
1175 : }
1176 : }
1177 :
1178 : /* There is no fixed compare type we are going to use but we have to
1179 : be able to get at one for each mask group. */
1180 23397 : unsigned int min_ni_width
1181 23397 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1182 :
1183 23397 : bool ok = true;
1184 88452 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1185 : {
1186 24345 : tree mask_type = rgc.type;
1187 24345 : if (!mask_type)
1188 869 : continue;
1189 :
1190 : /* For now vect_get_loop_mask only supports integer mode masks
1191 : when we need to split it. */
1192 23476 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1193 23476 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1194 : {
1195 : ok = false;
1196 : break;
1197 : }
1198 :
1199 : /* If iv_type is usable as compare type use that - we can elide the
1200 : saturation in that case. */
1201 17396 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1202 : {
1203 17396 : tree cmp_vectype
1204 17396 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1205 17396 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1206 5924 : rgc.compare_type = cmp_vectype;
1207 : }
1208 17396 : if (!rgc.compare_type)
1209 32987 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1210 : {
1211 32983 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1212 32983 : if (cmp_bits >= min_ni_width
1213 32983 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1214 : {
1215 32971 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1216 32971 : if (!cmp_type)
1217 0 : continue;
1218 :
1219 : /* Check whether we can produce the mask with cmp_type. */
1220 32971 : tree cmp_vectype
1221 32971 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1222 32971 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1223 : {
1224 11468 : rgc.compare_type = cmp_vectype;
1225 11468 : break;
1226 : }
1227 : }
1228 : }
1229 17396 : if (!rgc.compare_type)
1230 : {
1231 : ok = false;
1232 : break;
1233 : }
1234 : }
1235 23397 : if (!ok)
1236 : {
1237 6084 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1238 6084 : return false;
1239 : }
1240 :
1241 17313 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1242 17313 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1243 17313 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1244 17313 : return true;
1245 23397 : }
1246 :
1247 : /* Check whether we can use vector access with length based on precision
1248 : comparison. So far, to keep it simple, we only allow the case that the
1249 : precision of the target supported length is larger than the precision
1250 : required by loop niters. */
1251 :
1252 : static bool
1253 6 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1254 : {
1255 6 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1256 : return false;
1257 :
1258 0 : if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
1259 : return false;
1260 :
1261 0 : machine_mode len_load_mode, len_store_mode;
1262 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1263 0 : .exists (&len_load_mode))
1264 0 : return false;
1265 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1266 0 : .exists (&len_store_mode))
1267 0 : return false;
1268 :
1269 0 : signed char partial_load_bias = internal_len_load_store_bias
1270 0 : (IFN_LEN_LOAD, len_load_mode);
1271 :
1272 0 : signed char partial_store_bias = internal_len_load_store_bias
1273 0 : (IFN_LEN_STORE, len_store_mode);
1274 :
1275 0 : gcc_assert (partial_load_bias == partial_store_bias);
1276 :
1277 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1278 : return false;
1279 :
1280 : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1281 : len_loads with a length of zero. In order to avoid that we prohibit
1282 : more than one loop length here. */
1283 0 : if (partial_load_bias == -1
1284 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1285 : return false;
1286 :
1287 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1288 :
1289 0 : unsigned int max_nitems_per_iter = 1;
1290 0 : unsigned int i;
1291 0 : rgroup_controls *rgl;
1292 : /* Find the maximum number of items per iteration for every rgroup. */
1293 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1294 : {
1295 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1296 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1297 : }
1298 :
1299 : /* Work out how many bits we need to represent the length limit. */
1300 0 : unsigned int min_ni_prec
1301 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1302 :
1303 : /* Now use the maximum of below precisions for one suitable IV type:
1304 : - the IV's natural precision
1305 : - the precision needed to hold: the maximum number of scalar
1306 : iterations multiplied by the scale factor (min_ni_prec above)
1307 : - the Pmode precision
1308 :
1309 : If min_ni_prec is less than the precision of the current niters,
1310 : we prefer to still use the niters type. Prefer to use Pmode and
1311 : wider IV to avoid narrow conversions. */
1312 :
1313 0 : unsigned int ni_prec
1314 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1315 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1316 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1317 :
1318 0 : tree iv_type = NULL_TREE;
1319 0 : opt_scalar_int_mode tmode_iter;
1320 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1321 : {
1322 0 : scalar_mode tmode = tmode_iter.require ();
1323 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1324 :
1325 : /* ??? Do we really want to construct one IV whose precision exceeds
1326 : BITS_PER_WORD? */
1327 0 : if (tbits > BITS_PER_WORD)
1328 : break;
1329 :
1330 : /* Find the first available standard integral type. */
1331 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1332 : {
1333 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1334 0 : break;
1335 : }
1336 : }
1337 :
1338 0 : if (!iv_type)
1339 : {
1340 0 : if (dump_enabled_p ())
1341 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1342 : "can't vectorize with length-based partial vectors"
1343 : " because there is no suitable iv type.\n");
1344 0 : return false;
1345 : }
1346 :
1347 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1348 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1349 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1350 :
1351 0 : return true;
1352 : }
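
/* For example (a sketch): on a target whose LEN_LOAD has a bias of -1
   the length operand must never be zero, which is why at most one
   rgroup of lengths is accepted above; with a bias of 0 a zero length
   is permitted and multiple rgroups are fine.  */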
1353 :
1354 : /* Calculate the cost of one scalar iteration of the loop. */
1355 : static void
1356 361375 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1357 : {
1358 361375 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1359 361375 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1360 361375 : int nbbs = loop->num_nodes, factor;
1361 361375 : int innerloop_iters, i;
1362 :
1363 361375 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1364 :
1365 : /* Gather costs for statements in the scalar loop. */
1366 :
1367 : /* FORNOW. */
1368 361375 : innerloop_iters = 1;
1369 361375 : if (loop->inner)
1370 1604 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1371 :
1372 1242429 : for (i = 0; i < nbbs; i++)
1373 : {
1374 881054 : gimple_stmt_iterator si;
1375 881054 : basic_block bb = bbs[i];
1376 :
1377 881054 : if (bb->loop_father == loop->inner)
1378 : factor = innerloop_iters;
1379 : else
1380 877846 : factor = 1;
1381 :
1382 7059806 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1383 : {
1384 5297698 : gimple *stmt = gsi_stmt (si);
1385 5297698 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1386 :
1387 5297698 : if (!is_gimple_assign (stmt)
1388 : && !is_gimple_call (stmt)
1389 : && !is_a<gcond *> (stmt))
1390 1780402 : continue;
1391 :
1392 : /* Skip stmts that are not vectorized inside the loop. */
1393 3517296 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1394 3517296 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1395 1731824 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1396 47 : || !VECTORIZABLE_CYCLE_DEF
1397 : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1398 1731824 : continue;
1399 :
1400 1785472 : vect_cost_for_stmt kind;
1401 1785472 : if (STMT_VINFO_DATA_REF (stmt_info))
1402 : {
1403 857552 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1404 : kind = scalar_load;
1405 : else
1406 320569 : kind = scalar_store;
1407 : }
1408 927920 : else if (vect_nop_conversion_p (stmt_info))
1409 53763 : continue;
1410 : else
1411 : kind = scalar_stmt;
1412 :
1413 : /* We are using vect_prologue here to avoid scaling twice
1414 : by the inner loop factor. */
1415 1731709 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1416 : factor, kind, stmt_info, 0, vect_body);
1417 : }
1418 : }
1419 :
1420 : /* Now accumulate cost. */
1421 361375 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1422 361375 : add_stmt_costs (loop_vinfo->scalar_costs,
1423 : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1424 361375 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1425 361375 : }
1426 :
1427 : /* Function vect_analyze_loop_form.
1428 :
1429 : Verify that certain CFG restrictions hold, including:
1430 : - the loop has a pre-header
1431 : - the loop has a single entry
1432 : - nested loops can have only a single exit.
1433 : - the loop exit condition is simple enough
1434 : - the number of iterations can be analyzed, i.e., a countable loop. The
1435 : niter could be analyzed under some assumptions. */
1436 :
1437 : opt_result
1438 453524 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1439 : vect_loop_form_info *info)
1440 : {
1441 453524 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1442 :
1443 453524 : edge exit_e = vec_init_loop_exit_info (loop);
1444 453524 : if (!exit_e)
1445 29107 : return opt_result::failure_at (vect_location,
1446 : "not vectorized:"
1447 : " Infinite loop detected.\n");
1448 424417 : if (loop_vectorized_call)
1449 : {
1450 28598 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1451 28598 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1452 28598 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1453 28598 : if (!scalar_exit_e)
1454 0 : return opt_result::failure_at (vect_location,
1455 : "not vectorized:"
1456 : " could not determine main exit from"
1457 : " loop with multiple exits.\n");
1458 : }
1459 :
1460 424417 : info->loop_exit = exit_e;
1461 424417 : if (dump_enabled_p ())
1462 16007 : dump_printf_loc (MSG_NOTE, vect_location,
1463 : "using as main loop exit: %d -> %d [AUX: %p]\n",
1464 16007 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1465 :
1466 : /* Check if we have any control flow that doesn't leave the loop. */
1467 424417 : basic_block *bbs = get_loop_body (loop);
1468 1389603 : for (unsigned i = 0; i < loop->num_nodes; i++)
1469 1080319 : if (EDGE_COUNT (bbs[i]->succs) != 1
1470 1080319 : && (EDGE_COUNT (bbs[i]->succs) != 2
1471 647286 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1472 : {
1473 115133 : free (bbs);
1474 115133 : return opt_result::failure_at (vect_location,
1475 : "not vectorized:"
1476 : " unsupported control flow in loop.\n");
1477 : }
1478 :
1479 : /* Check that the loop contains at least one PHI node; without any
 : scalar cycles there is no scalar evolution for the vectorizer to
 : analyze. */
1480 310379 : bool has_phi = false;
1481 310379 : for (unsigned i = 0; i < loop->num_nodes; i++)
1482 309922 : if (!gimple_seq_empty_p (phi_nodes (bbs[i])))
1483 : {
1484 : has_phi = true;
1485 : break;
1486 : }
1487 309284 : if (!has_phi)
 : {
 : /* Free the loop body array on this early-return path, as the
 : control flow check above does. */
 : free (bbs);
1488 457 : return opt_result::failure_at (vect_location,
1489 : "not vectorized:"
1490 : " no scalar evolution detected in loop.\n");
 : }
1491 :
1492 308827 : free (bbs);
1493 :
1494 : /* Different restrictions apply when we are considering an inner-most loop,
1495 : vs. an outer (nested) loop.
1496 : (FORNOW. May want to relax some of these restrictions in the future). */
1497 :
1498 308827 : info->inner_loop_cond = NULL;
1499 308827 : if (!loop->inner)
1500 : {
1501 : /* Inner-most loop. */
1502 :
1503 290353 : if (empty_block_p (loop->header))
1504 0 : return opt_result::failure_at (vect_location,
1505 : "not vectorized: empty loop.\n");
1506 : }
1507 : else
1508 : {
1509 18474 : class loop *innerloop = loop->inner;
1510 18474 : edge entryedge;
1511 :
1512 : /* Nested loop. We currently require that the loop is doubly-nested,
1513 : contains a single inner loop with a single exit to the block
1514 : with the single exit condition in the outer loop.
1515 : Vectorizable outer-loops look like this:
1516 :
1517 : (pre-header)
1518 : |
1519 : header <---+
1520 : | |
1521 : inner-loop |
1522 : | |
1523 : tail ------+
1524 : |
1525 : (exit-bb)
1526 :
1527 : The inner-loop also has the properties expected of inner-most loops
1528 : as described above. */
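 :
 : /* In source terms this corresponds to a nest of the following shape
 : (a sketch; M must be invariant in the outer loop, which is
 : verified further below):
 :
 : for (i = 0; i < n; i++) <- outer loop being analyzed
 : {
 : int s = 0;
 : for (j = 0; j < m; j++) <- single inner loop
 : s += a[i][j];
 : b[i] = s; <- tail with the outer exit test
 : } */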
1529 :
1530 18474 : if ((loop->inner)->inner || (loop->inner)->next)
1531 2935 : return opt_result::failure_at (vect_location,
1532 : "not vectorized:"
1533 : " multiple nested loops.\n");
1534 :
1535 15539 : entryedge = loop_preheader_edge (innerloop);
1536 15539 : if (entryedge->src != loop->header
1537 15040 : || !single_exit (innerloop)
1538 26949 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1539 4471 : return opt_result::failure_at (vect_location,
1540 : "not vectorized:"
1541 : " unsupported outerloop form.\n");
1542 :
1543 : /* Analyze the inner-loop. */
1544 11068 : vect_loop_form_info inner;
1545 11068 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1546 11068 : if (!res)
1547 : {
1548 416 : if (dump_enabled_p ())
1549 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1550 : "not vectorized: Bad inner loop.\n");
1551 416 : return res;
1552 : }
1553 :
1554 : /* We don't support analyzing the niter under assumptions for the
1555 : inner loop. */
1556 10652 : if (!integer_onep (inner.assumptions))
1557 257 : return opt_result::failure_at (vect_location,
1558 : "not vectorized: Bad inner loop.\n");
1559 :
1560 10395 : if (inner.number_of_iterations == chrec_dont_know
1561 10395 : || !expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1562 1837 : return opt_result::failure_at (vect_location,
1563 : "not vectorized: inner-loop count not"
1564 : " invariant.\n");
1565 :
1566 8558 : if (dump_enabled_p ())
1567 1049 : dump_printf_loc (MSG_NOTE, vect_location,
1568 : "Considering outer-loop vectorization.\n");
1569 8558 : info->inner_loop_cond = inner.conds[0];
1570 11068 : }
1571 :
1572 298911 : if (EDGE_COUNT (loop->header->preds) != 2)
1573 0 : return opt_result::failure_at (vect_location,
1574 : "not vectorized:"
1575 : " too many incoming edges.\n");
1576 :
1577 : /* We assume that the latch is empty. */
1578 298911 : basic_block latch = loop->latch;
1579 298911 : do
1580 : {
1581 298911 : if (!empty_block_p (latch)
1582 298911 : || !gimple_seq_empty_p (phi_nodes (latch)))
1583 20670 : return opt_result::failure_at (vect_location,
1584 : "not vectorized: latch block not "
1585 : "empty.\n");
1586 278241 : latch = single_pred (latch);
1587 : }
1588 556482 : while (single_succ_p (latch));
1589 :
1590 : /* Make sure there is no abnormal exit. */
1591 278241 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1592 1233667 : for (edge e : exits)
1593 : {
1594 398977 : if (e->flags & EDGE_ABNORMAL)
1595 33 : return opt_result::failure_at (vect_location,
1596 : "not vectorized:"
1597 : " abnormal loop exit edge.\n");
1598 : }
1599 :
1600 278208 : info->conds
1601 278208 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1602 : &info->number_of_iterations,
1603 278208 : &info->number_of_iterationsm1);
1604 278208 : if (info->conds.is_empty ())
1605 33 : return opt_result::failure_at
1606 33 : (vect_location,
1607 : "not vectorized: complicated exit condition.\n");
1608 :
1609 : /* Determine what the primary and alternate exit conds are. */
1610 677086 : for (unsigned i = 0; i < info->conds.length (); i++)
1611 : {
1612 398911 : gcond *cond = info->conds[i];
1613 398911 : if (exit_e->src == gimple_bb (cond))
1614 278175 : std::swap (info->conds[0], info->conds[i]);
1615 : }
1616 :
1617 278175 : if (chrec_contains_undetermined (info->number_of_iterations))
1618 : {
1619 58424 : if (dump_enabled_p ())
1620 257 : dump_printf_loc (MSG_NOTE, vect_location,
1621 : "Loop being analyzed as uncounted.\n");
1622 58424 : if (loop->inner)
1623 562 : return opt_result::failure_at
1624 562 : (vect_location,
1625 : "not vectorized: outer loop vectorization of uncounted loops"
1626 : " is unsupported.\n");
1627 57862 : return opt_result::success ();
1628 : }
1629 :
1630 219751 : if (integer_zerop (info->assumptions))
1631 4 : return opt_result::failure_at
1632 4 : (info->conds[0],
1633 : "not vectorized: number of iterations cannot be computed.\n");
1634 :
1635 219747 : if (integer_zerop (info->number_of_iterations))
1636 12 : return opt_result::failure_at
1637 12 : (info->conds[0],
1638 : "not vectorized: number of iterations = 0.\n");
1639 :
1640 219735 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1641 120876 : && tree_to_shwi (info->number_of_iterations) > 0))
1642 : {
1643 98859 : if (dump_enabled_p ())
1644 : {
1645 2473 : dump_printf_loc (MSG_NOTE, vect_location,
1646 : "Symbolic number of iterations is ");
1647 2473 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1648 2473 : dump_printf (MSG_NOTE, "\n");
1649 : }
1650 : }
1651 :
1652 219735 : if (!integer_onep (info->assumptions))
1653 : {
1654 8521 : if (dump_enabled_p ())
1655 : {
1656 66 : dump_printf_loc (MSG_NOTE, vect_location,
1657 : "Loop to be versioned with niter assumption ");
1658 66 : dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
1659 66 : dump_printf (MSG_NOTE, "\n");
1660 : }
1661 : }
1662 :
1663 219735 : return opt_result::success ();
1664 278241 : }
1665 :
1666 : /* Create a loop_vec_info for LOOP with SHARED and the
1667 : vect_analyze_loop_form result INFO. ORIG_LOOP_INFO, when non-NULL,
 : is the loop_vec_info of the loop this one is analyzed as an
 : epilogue for. */
1668 :
1669 : loop_vec_info
1670 573065 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1671 : const vect_loop_form_info *info,
1672 : loop_vec_info orig_loop_info)
1673 : {
1674 573065 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1675 573065 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1676 573065 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1677 573065 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1678 573065 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1679 573065 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1680 372 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1681 372 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1682 : else
1683 572693 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1684 : /* Also record the assumptions for versioning. */
1685 573065 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1686 19264 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1687 :
1688 2542495 : for (gcond *cond : info->conds)
1689 : {
1690 823300 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1691 : /* Mark the statement as a condition. */
1692 823300 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1693 : }
1694 :
1695 573065 : unsigned cond_id = 0;
1696 573065 : if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
1697 489029 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[cond_id++];
1698 :
1699 907336 : for (; cond_id < info->conds.length (); cond_id ++)
1700 334271 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[cond_id]);
1701 :
1702 573065 : LOOP_VINFO_MAIN_EXIT (loop_vinfo) = info->loop_exit;
1703 :
1704 : /* Check to see if we're vectorizing multiple exits. */
1705 573065 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1706 573065 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1707 :
1708 573065 : if (info->inner_loop_cond)
1709 : {
1710 : /* If we have an estimate on the number of iterations of the inner
1711 : loop, use that to limit the scale for costing, otherwise use
1712 : --param vect-inner-loop-cost-factor literally. */
1713 8967 : widest_int nit;
1714 8967 : if (estimated_stmt_executions (loop->inner, &nit))
1715 7675 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1716 7675 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1717 8967 : }
1718 :
1719 573065 : return loop_vinfo;
1720 : }
1721 :
1722 :
1723 :
1724 : /* Return true if we know that the iteration count is smaller than the
1725 : vectorization factor. Return false if it isn't, or if we can't be sure
1726 : either way. */
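 :
 : /* E.g. (a sketch): if niter analysis bounds the loop to at most 3
 : iterations, an assumed VF of 8 can never be filled and we return
 : true; with no known bound we conservatively return false. */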
1727 :
1728 : static bool
1729 154007 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1730 : {
1731 154007 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1732 :
1733 154007 : HOST_WIDE_INT max_niter;
1734 154007 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1735 79128 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1736 : else
1737 74879 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1738 :
1739 154007 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1740 10910 : return true;
1741 :
1742 : return false;
1743 : }
1744 :
1745 : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1746 : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1747 : definitely no, or -1 if it's worth retrying. */
1748 :
1749 : static int
1750 154016 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1751 : unsigned *suggested_unroll_factor)
1752 : {
1753 154016 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1754 154016 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1755 :
1756 : /* Only loops that can handle partially-populated vectors can have iteration
1757 : counts less than the vectorization factor. */
1758 154016 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
1759 154016 : && vect_known_niters_smaller_than_vf (loop_vinfo))
1760 : {
1761 10900 : if (dump_enabled_p ())
1762 236 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1763 : "not vectorized: iteration count smaller than "
1764 : "vectorization factor.\n");
1765 10900 : return 0;
1766 : }
1767 :
1768 : /* If we know the number of iterations we can do better: for the
1769 : epilogue we can also decide whether the main loop leaves us
1770 : with enough iterations, preferring a smaller vector epilogue that
1771 : is then also usable for the case in which we skip the vector loop. */
1772 143116 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1773 : {
1774 69466 : widest_int scalar_niters
1775 69466 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
1776 69466 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1777 : {
1778 2646 : loop_vec_info orig_loop_vinfo
1779 : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1780 2646 : loop_vec_info main_loop_vinfo
1781 : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
1782 2646 : unsigned lowest_vf
1783 2646 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
1784 2646 : int prolog_peeling = 0;
1785 2646 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
1786 2646 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
1787 2646 : if (prolog_peeling >= 0
1788 2646 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
1789 : lowest_vf))
1790 : {
1791 5282 : unsigned gap
1792 2641 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
1793 5282 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
1794 5282 : % lowest_vf + gap);
1795 : }
1796 : }
1797 : /* Reject vectorizing for a single scalar iteration, even if
1798 : we could in principle implement that using partial vectors.
1799 : But allow such vectorization if VF == 1, provided we do not
1800 : need to peel for gaps (if we do, avoid vectorization for
1801 : reasons of code footprint). */
1802 69466 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1803 69466 : if (scalar_niters <= peeling_gap + 1
1804 69466 : && (assumed_vf > 1 || peeling_gap != 0))
1805 : {
1806 653 : if (dump_enabled_p ())
1807 159 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1808 : "not vectorized: loop only has a single "
1809 : "scalar iteration.\n");
1810 653 : return 0;
1811 : }
1812 :
1813 68813 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1814 : {
1815 : /* Check that the loop processes at least one full vector. */
1816 68802 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1817 68802 : if (known_lt (scalar_niters, vf))
1818 : {
1819 348 : if (dump_enabled_p ())
1820 296 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1821 : "loop does not have enough iterations "
1822 : "to support vectorization.\n");
1823 388 : return 0;
1824 : }
1825 :
1826 : /* If we need to peel an extra epilogue iteration to handle data
1827 : accesses with gaps, check that there are enough scalar iterations
1828 : available.
1829 :
1830 : The check above is redundant with this one when peeling for gaps,
1831 : but the distinction is useful for diagnostics. */
1832 68454 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1833 68760 : && known_le (scalar_niters, vf))
1834 : {
1835 40 : if (dump_enabled_p ())
1836 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 : "loop does not have enough iterations "
1838 : "to support peeling for gaps.\n");
1839 40 : return 0;
1840 : }
1841 : }
1842 69466 : }
1843 :
1844 : /* If using the "very cheap" model, reject cases in which we'd keep
1845 : a copy of the scalar code (even if we might be able to vectorize it). */
1846 142075 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1847 142075 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1848 75597 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
1849 : {
1850 721 : if (dump_enabled_p ())
1851 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1852 : "some scalar iterations would need to be peeled\n");
1853 721 : return 0;
1854 : }
1855 :
1856 141354 : int min_profitable_iters, min_profitable_estimate;
1857 141354 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1858 : &min_profitable_estimate,
1859 : suggested_unroll_factor);
1860 :
1861 141354 : if (min_profitable_iters < 0)
1862 : {
1863 24580 : if (dump_enabled_p ())
1864 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1865 : "not vectorized: vectorization not profitable.\n");
1866 24580 : if (dump_enabled_p ())
1867 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868 : "not vectorized: vector version will never be "
1869 : "profitable.\n");
1870 24580 : return -1;
1871 : }
1872 :
1873 116774 : int min_scalar_loop_bound = (param_min_vect_loop_bound
1874 116774 : * assumed_vf);
1875 :
1876 : /* Use the cost model only if it is more conservative than the
1877 : user-specified threshold. */
1878 116774 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1879 : min_profitable_iters);
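 : /* E.g. (hypothetical numbers): with --param min-vect-loop-bound=2
 : and an assumed VF of 4, min_scalar_loop_bound is 8; if the cost
 : model computed min_profitable_iters == 5, TH becomes 8. */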
1880 :
1881 116774 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1882 :
1883 63144 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1884 179918 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1885 : {
1886 442 : if (dump_enabled_p ())
1887 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 : "not vectorized: vectorization not profitable.\n");
1889 442 : if (dump_enabled_p ())
1890 1 : dump_printf_loc (MSG_NOTE, vect_location,
1891 : "not vectorized: iteration count smaller than user "
1892 : "specified loop bound parameter or minimum profitable "
1893 : "iterations (whichever is more conservative).\n");
1894 442 : return 0;
1895 : }
1896 :
1897 : /* The static profitability threshold min_profitable_estimate includes
1898 : the cost of having to check at runtime whether the scalar loop
1899 : should be used instead. If it turns out that we don't need or want
1900 : such a check, the threshold we should use for the static estimate
1901 : is simply the point at which the vector loop becomes more profitable
1902 : than the scalar loop. */
1903 116332 : if (min_profitable_estimate > min_profitable_iters
1904 24573 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1905 23964 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1906 613 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1907 116945 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1908 : {
1909 12 : if (dump_enabled_p ())
1910 7 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1911 : " choice between the scalar and vector loops\n");
1912 12 : min_profitable_estimate = min_profitable_iters;
1913 : }
1914 :
1915 : /* If the vector loop needs multiple iterations to be beneficial then
1916 : things are probably too close to call, and the conservative thing
1917 : would be to stick with the scalar code. */
1918 116332 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1919 116332 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1920 : {
1921 17777 : if (dump_enabled_p ())
1922 223 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1923 : "one iteration of the vector loop would be"
1924 : " more expensive than the equivalent number of"
1925 : " iterations of the scalar loop\n");
1926 17777 : return 0;
1927 : }
1928 :
1929 98555 : HOST_WIDE_INT estimated_niter;
1930 :
1931 : /* If we are vectorizing an epilogue then we know the maximum number of
1932 : scalar iterations it will cover is at least one lower than the
1933 : vectorization factor of the main loop. */
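 : /* E.g. (a sketch): with a main-loop VF of 8 this epilogue covers at
 : most 7 scalar iterations. */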
1934 98555 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1935 12115 : estimated_niter
1936 12115 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1937 : else
1938 : {
1939 86440 : estimated_niter = estimated_stmt_executions_int (loop);
1940 86440 : if (estimated_niter == -1)
1941 31205 : estimated_niter = likely_max_stmt_executions_int (loop);
1942 : }
1943 43320 : if (estimated_niter != -1
1944 95646 : && ((unsigned HOST_WIDE_INT) estimated_niter
1945 95646 : < MAX (th, (unsigned) min_profitable_estimate)))
1946 : {
1947 4337 : if (dump_enabled_p ())
1948 32 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1949 : "not vectorized: estimated iteration count too "
1950 : "small.\n");
1951 4337 : if (dump_enabled_p ())
1952 32 : dump_printf_loc (MSG_NOTE, vect_location,
1953 : "not vectorized: estimated iteration count smaller "
1954 : "than specified loop bound parameter or minimum "
1955 : "profitable iterations (whichever is more "
1956 : "conservative).\n");
1957 4337 : return -1;
1958 : }
1959 :
1960 : /* As we cannot use a runtime check to gate profitability for uncounted
1961 : loops, require either an estimate or, failing that, a profitable
1962 : vectorization already within the first vector iteration (a condition
1963 : that will practically never hold due to the required epilogue and
1964 : the likely alignment prologue). */
1965 94218 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)
1966 163 : && estimated_niter == -1
1967 94354 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1968 : {
1969 120 : if (dump_enabled_p ())
1970 2 : dump_printf_loc (MSG_NOTE, vect_location,
1971 : "not vectorized: no loop iteration estimate on the "
1972 : "uncounted loop and not trivially profitable.\n");
1973 120 : return -1;
1974 : }
1975 :
1976 : return 1;
1977 : }
1978 :
1979 : /* Gather data references in LOOP with body BBS and store them into
1980 : *DATAREFS. */
1981 :
1982 : static opt_result
1983 275844 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1984 : vec<data_reference_p> *datarefs)
1985 : {
1986 822760 : for (unsigned i = 0; i < loop->num_nodes; i++)
1987 1219466 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1988 5150317 : !gsi_end_p (gsi); gsi_next (&gsi))
1989 : {
1990 4603401 : gimple *stmt = gsi_stmt (gsi);
1991 4603401 : if (is_gimple_debug (stmt))
1992 2116166 : continue;
1993 2487365 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1994 : NULL, 0);
1995 2487365 : if (!res)
1996 : {
1997 62947 : if (is_gimple_call (stmt) && loop->safelen)
1998 : {
1999 404 : tree fndecl = gimple_call_fndecl (stmt), op;
2000 404 : if (fndecl == NULL_TREE
2001 404 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2002 : {
2003 0 : fndecl = gimple_call_arg (stmt, 0);
2004 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2005 0 : fndecl = TREE_OPERAND (fndecl, 0);
2006 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2007 : }
2008 404 : if (fndecl != NULL_TREE)
2009 : {
2010 367 : cgraph_node *node = cgraph_node::get (fndecl);
2011 367 : if (node != NULL && node->simd_clones != NULL)
2012 : {
2013 131 : unsigned int j, n = gimple_call_num_args (stmt);
2014 545 : for (j = 0; j < n; j++)
2015 : {
2016 284 : op = gimple_call_arg (stmt, j);
2017 284 : if (DECL_P (op)
2018 284 : || (REFERENCE_CLASS_P (op)
2019 0 : && get_base_address (op)))
2020 : break;
2021 : }
2022 131 : op = gimple_call_lhs (stmt);
2023 : /* Ignore #pragma omp declare simd functions
2024 : if they don't have data references in the
2025 : call stmt itself. */
2026 261 : if (j == n
2027 131 : && !(op
2028 120 : && (DECL_P (op)
2029 120 : || (REFERENCE_CLASS_P (op)
2030 0 : && get_base_address (op)))))
2031 130 : continue;
2032 : }
2033 : }
2034 : }
2035 62817 : return res;
2036 : }
2037 : /* If dependence analysis will give up due to the limit on the
2038 : number of datarefs, stop here and fail fatally. */
2039 4252157 : if (datarefs->length ()
2040 1827739 : > (unsigned)param_loop_max_datarefs_for_datadeps)
2041 0 : return opt_result::failure_at (stmt, "exceeded param "
2042 : "loop-max-datarefs-for-datadeps\n");
2043 : }
2044 213027 : return opt_result::success ();
2045 : }
2046 :
2047 : /* Determine if operating on full vectors for LOOP_VINFO might leave
2048 : some scalar iterations still to do. If so, decide how we should
2049 : handle those scalar iterations. The possibilities are:
2050 :
2051 : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2052 : In this case:
2053 :
2054 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2055 : LOOP_VINFO_PEELING_FOR_NITER == false
2056 :
2057 : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2058 : to handle the remaining scalar iterations. In this case:
2059 :
2060 : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2061 : LOOP_VINFO_PEELING_FOR_NITER == true
2062 :
2063 : The MASKED_P argument specifies to what extent
2064 : param_vect_partial_vector_usage is to be honored. For MASKED_P == 0
2065 : no partial vectors are to be used, for MASKED_P == -1 it's
2066 : param_vect_partial_vector_usage that gets to decide whether we may
2067 : consider partial vector usage. For MASKED_P == 1 partial vectors
2068 : may be used if possible.
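 :
 : For example (a sketch): with 10 scalar iterations and a
 : vectorization factor of 4, (1) executes three vector iterations,
 : the last with only two active lanes, whereas (2) executes two full
 : vector iterations and leaves two scalar iterations to an epilogue
 : loop.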
2069 :
2070 : */
2071 :
2072 : static opt_result
2073 154925 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2074 : int masked_p)
2075 : {
2076 : /* Determine whether there would be any scalar iterations left over. */
2077 154925 : bool need_peeling_or_partial_vectors_p
2078 154925 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2079 :
2080 : /* Decide whether to vectorize the loop with partial vectors. */
2081 154925 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2082 154925 : if (masked_p == 0
2083 154925 : || (masked_p == -1 && param_vect_partial_vector_usage == 0))
2084 : /* If requested explicitly do not use partial vectors. */
2085 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2086 207 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2087 65 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2088 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2089 207 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2090 65 : && need_peeling_or_partial_vectors_p)
2091 : {
2092 : /* For partial-vector-usage=1, try to push the handling of partial
2093 : vectors to the epilogue, with the main loop continuing to operate
2094 : on full vectors.
2095 :
2096 : If we are unrolling we also do not want to use partial vectors. This
2097 : is to avoid the overhead of generating multiple masks and also to
2098 : avoid having to execute entire iterations of FALSE masked instructions
2099 : when dealing with one or fewer full iterations.
2100 :
2101 : ??? We could then end up failing to use partial vectors if we
2102 : decide to peel iterations into a prologue, and if the main loop
2103 : then ends up processing fewer than VF iterations. */
2104 43 : if ((param_vect_partial_vector_usage == 1
2105 11 : || loop_vinfo->suggested_unroll_factor > 1)
2106 32 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2107 65 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2108 : ;
2109 : else
2110 31 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2111 : }
2112 :
2113 154925 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2114 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2115 0 : return opt_result::failure_at (vect_location,
2116 : "not vectorized: loop needs but cannot "
2117 : "use partial vectors\n");
2118 :
2119 154925 : if (dump_enabled_p ())
2120 12493 : dump_printf_loc (MSG_NOTE, vect_location,
2121 : "operating on %s vectors%s.\n",
2122 12493 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2123 : ? "partial" : "full",
2124 12493 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2125 : ? " for epilogue loop" : "");
2126 :
2127 154925 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2128 309850 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2129 154925 : && need_peeling_or_partial_vectors_p);
2130 :
2131 154925 : return opt_result::success ();
2132 : }
2133 :
2134 : /* Function vect_analyze_loop_2.
2135 :
2136 : Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2137 : analyses record information in some members of LOOP_VINFO. FATAL
2138 : indicates whether some analysis hit a fatal error. If
2139 : SUGGESTED_UNROLL_FACTOR is a non-NULL pointer, it is to be filled with
2140 : a worked-out suggested unroll factor; if it is NULL, a previously
2141 : suggested unroll factor is being applied.
2142 : SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF holds whether single-lane
2143 : SLP was forced when the suggested unroll factor was worked out. */
2144 : static opt_result
2145 572365 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, int masked_p, bool &fatal,
2146 : unsigned *suggested_unroll_factor,
2147 : bool& single_lane_slp_done_for_suggested_uf)
2148 : {
2149 572365 : opt_result ok = opt_result::success ();
2150 572365 : int res;
2151 572365 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2152 572365 : loop_vec_info orig_loop_vinfo = NULL;
2153 :
2154 : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2155 : loop_vec_info of the first vectorized loop. */
2156 572365 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2157 13837 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2158 : else
2159 : orig_loop_vinfo = loop_vinfo;
2160 13837 : gcc_assert (orig_loop_vinfo);
2161 :
2162 : /* We can't mask on niters for uncounted loops due to the unknown upper bound. */
2163 572365 : if (LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
2164 84036 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2165 :
2166 : /* The first group of checks is independent of the vector size. */
2167 572365 : fatal = true;
2168 :
2169 572365 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2170 572365 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2171 5 : return opt_result::failure_at (vect_location,
2172 : "not vectorized: simd if(0)\n");
2173 :
2174 : /* Find all data references in the loop (which correspond to vdefs/vuses)
2175 : and analyze their evolution in the loop. */
2176 :
2177 572360 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2178 :
2179 : /* Gather the data references. */
2180 572360 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2181 : {
2182 275844 : opt_result res
2183 275844 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2184 : &LOOP_VINFO_DATAREFS (loop_vinfo));
2185 275844 : if (!res)
2186 : {
2187 62817 : if (dump_enabled_p ())
2188 1642 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2189 : "not vectorized: loop contains function "
2190 : "calls or data references that cannot "
2191 : "be analyzed\n");
2192 62817 : return res;
2193 : }
2194 213027 : loop_vinfo->shared->save_datarefs ();
2195 : }
2196 : else
2197 296516 : loop_vinfo->shared->check_datarefs ();
2198 :
2199 : /* Analyze the data references and also adjust the minimal
2200 : vectorization factor according to the loads and stores. */
2201 :
2202 509543 : ok = vect_analyze_data_refs (loop_vinfo, &fatal);
2203 509543 : if (!ok)
2204 : {
2205 71792 : if (dump_enabled_p ())
2206 1230 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2207 : "bad data references.\n");
2208 71792 : return ok;
2209 : }
2210 :
2211 : /* Check if we are applying unroll factor now. */
2212 437751 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2213 437751 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2214 :
2215 : /* When single-lane SLP was forced and we are applying suggested unroll
2216 : factor, keep that decision here. */
2217 875502 : bool force_single_lane = (applying_suggested_uf
2218 437751 : && single_lane_slp_done_for_suggested_uf);
2219 :
2220 : /* Classify all cross-iteration scalar data-flow cycles.
2221 : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2222 437751 : vect_analyze_scalar_cycles (loop_vinfo);
2223 :
2224 437751 : vect_pattern_recog (loop_vinfo);
2225 :
2226 : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2227 : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2228 :
2229 437751 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2230 437751 : if (!ok)
2231 : {
2232 7926 : if (dump_enabled_p ())
2233 291 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2234 : "bad data access.\n");
2235 7926 : return ok;
2236 : }
2237 :
2238 : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2239 :
2240 429825 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2241 429825 : if (!ok)
2242 : {
2243 45061 : if (dump_enabled_p ())
2244 401 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2245 : "unexpected pattern.\n");
2246 45061 : return ok;
2247 : }
2248 :
2249 : /* The checks so far were independent of the vector size; the rest of
 : the analysis below depends on it in some way, so from here on a
 : failure is not necessarily fatal. */
2250 384764 : fatal = false;
2251 :
2252 : /* Analyze data dependences between the data-refs in the loop
2253 : and adjust the maximum vectorization factor according to
2254 : the dependences.
2255 : FORNOW: fail at the first data dependence that we encounter. */
2256 :
2257 384764 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2258 384764 : if (!ok)
2259 : {
2260 23389 : if (dump_enabled_p ())
2261 538 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 : "bad data dependence.\n");
2263 23389 : return ok;
2264 : }
2265 361375 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2266 :
2267 : /* Compute the scalar iteration cost. */
2268 361375 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2269 :
2270 361375 : bool saved_can_use_partial_vectors_p
2271 : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2272 :
2273 : /* This is the point where we can re-start analysis with single-lane
2274 : SLP forced. */
2275 495677 : start_over:
2276 :
2277 : /* Check the SLP opportunities in the loop, analyze and build
2278 : SLP trees. */
2279 991354 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2280 : force_single_lane);
2281 495677 : if (!ok)
2282 24878 : return ok;
2283 :
2284 : /* If there are any SLP instances mark them as pure_slp and compute
2285 : the overall vectorization factor. */
2286 470799 : if (!vect_make_slp_decision (loop_vinfo))
2287 60822 : return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
2288 :
2289 409977 : if (dump_enabled_p ())
2290 19042 : dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
2291 :
2292 : /* Dump the vectorization factor from the SLP decision. */
2293 409977 : if (dump_enabled_p ())
2294 : {
2295 19042 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
2296 19042 : dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2297 19042 : dump_printf (MSG_NOTE, "\n");
2298 : }
2299 :
2300 : /* We don't expect to have to roll back to anything other than an empty
2301 : set of rgroups. */
2302 409977 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2303 :
2304 : /* Apply the suggested unrolling factor; this was determined by the backend
2305 : during finish_cost the first time we ran the analysis for this
2306 : vector mode. */
2307 409977 : if (applying_suggested_uf)
2308 437 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2309 :
2310 : /* Now the vectorization factor is final. */
2311 409977 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2312 409977 : gcc_assert (known_ne (vectorization_factor, 0U));
2313 :
2314 : /* Optimize the SLP graph with the vectorization factor fixed. */
2315 409977 : vect_optimize_slp (loop_vinfo);
2316 :
2317 : /* Gather the loads reachable from the SLP graph entries. */
2318 409977 : vect_gather_slp_loads (loop_vinfo);
2319 :
2320 409977 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2321 : {
2322 14145 : dump_printf_loc (MSG_NOTE, vect_location,
2323 : "vectorization_factor = ");
2324 14145 : dump_dec (MSG_NOTE, vectorization_factor);
2325 14145 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2326 14145 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2327 : }
2328 :
2329 409977 : if (max_vf != MAX_VECTORIZATION_FACTOR
2330 409977 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2331 41 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2332 :
2333 409936 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2334 :
2335 : /* Analyze the alignment of the data-refs in the loop. */
2336 409936 : vect_analyze_data_refs_alignment (loop_vinfo);
2337 :
2338 : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2339 : It is important to call pruning after vect_analyze_data_ref_accesses,
2340 : since we use grouping information gathered by interleaving analysis. */
2341 409936 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2342 409936 : if (!ok)
2343 16509 : return ok;
2344 :
2345 : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2346 : vectorization, since we do not want to add extra peeling or
2347 : add versioning for alignment. */
2348 393427 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2349 : /* This pass will decide on using loop versioning and/or loop peeling in
2350 : order to enhance the alignment of data references in the loop. */
2351 378749 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2352 393427 : if (!ok)
2353 0 : return ok;
2354 :
2355 : /* Analyze operations in the SLP instances. We can't simply
2356 : remove unsupported SLP instances as this makes the above
2357 : SLP kind detection invalid and might also affect the VF. */
2358 393427 : if (! vect_slp_analyze_operations (loop_vinfo))
2359 : {
2360 238502 : ok = opt_result::failure_at (vect_location,
2361 : "unsupported SLP instances\n");
2362 238502 : goto again;
2363 : }
2364 :
2365 : /* For now, we don't expect to mix both masking and length approaches for one
2366 : loop; disable partial vectors if both are recorded. */
2367 154925 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2368 23403 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2369 178322 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2370 : {
2371 0 : if (dump_enabled_p ())
2372 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2373 : "can't vectorize a loop with partial vectors"
2374 : " because we don't expect to mix different"
2375 : " approaches with partial vectors for the"
2376 : " same loop.\n");
2377 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2378 : }
2379 :
2380 : /* If we still have the option of using partial vectors,
2381 : check whether we can generate the necessary loop controls. */
2382 154925 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2383 : {
2384 23403 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2385 : {
2386 23397 : if (!vect_verify_full_masking (loop_vinfo)
2387 23397 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2388 6084 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2389 : }
2390 : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2391 6 : if (!vect_verify_loop_lens (loop_vinfo))
2392 6 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2393 : }
2394 :
2395 : /* Decide whether this loop_vinfo should use partial vectors or peeling,
2396 : assuming that the loop will be used as a main loop. We will redo
2397 : this analysis later if we instead decide to use the loop as an
2398 : epilogue loop. */
2399 154925 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, masked_p);
2400 154925 : if (!ok)
2401 0 : return ok;
2402 :
2403 : /* If we're vectorizing a loop that uses length "controls" and
2404 : can iterate more than once, we apply the decrementing IV approach
2405 : in loop control. */
2406 154925 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2407 31 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2408 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2409 154925 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2410 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2411 : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2412 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2413 :
2414 : /* If a loop uses length controls and has a decrementing loop control IV,
2415 : we will normally pass that IV through a MIN_EXPR to calculate the
2416 : basis for the length controls. E.g. in a loop that processes one
2417 : element per scalar iteration, the number of elements would be
2418 : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2419 :
2420 : This MIN_EXPR approach allows us to use pointer IVs with an invariant
2421 : step, since only the final iteration of the vector loop can have
2422 : inactive lanes.
2423 :
2424 : However, some targets have a dedicated instruction for calculating the
2425 : preferred length, given the total number of elements that still need to
2426 : be processed. This is encapsulated in the SELECT_VL internal function.
2427 :
2428 : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2429 : to determine the basis for the length controls. However, unlike the
2430 : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2431 : lanes inactive in any iteration of the vector loop, not just the last
2432 : iteration. This SELECT_VL approach therefore requires us to use pointer
2433 : IVs with variable steps.
2434 :
2435 : Once we've decided how many elements should be processed by one
2436 : iteration of the vector loop, we need to populate the rgroup controls.
2437 : If a loop has multiple rgroups, we need to make sure that those rgroups
2438 : "line up" (that is, they must be consistent about which elements are
2439 : active and which aren't). This is done by vect_adjust_loop_lens_control.
2440 :
2441 : In principle, it would be possible to use vect_adjust_loop_lens_control
2442 : on either the result of a MIN_EXPR or the result of a SELECT_VL.
2443 : However:
2444 :
2445 : (1) In practice, it only makes sense to use SELECT_VL when a vector
2446 : operation will be controlled directly by the result. It is not
2447 : worth using SELECT_VL if it would only be the input to other
2448 : calculations.
2449 :
2450 : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2451 : pointer IV will need N updates by a variable amount (N-1 updates
2452 : within the iteration and 1 update to move to the next iteration).
2453 :
2454 : Because of this, we prefer to use the MIN_EXPR approach whenever there
2455 : is more than one length control.
2456 :
2457 : In addition, SELECT_VL always operates to a granularity of 1 unit.
2458 : If we wanted to use it to control an SLP operation on N consecutive
2459 : elements, we would need to make the SELECT_VL inputs measure scalar
2460 : iterations (rather than elements) and then multiply the SELECT_VL
2461 : result by N. But using SELECT_VL this way is inefficient because
2462 : of (1) above.
2463 :
2464 : Furthermore, we do not apply SELECT_VL even on a single rgroup when
2465 : both of the following are satisfied:
2466 :
2467 : (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2468 : (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2469 :
2470 : This is because the variable step of SELECT_VL makes SCEV analysis
2471 : fail, and we would then miss the benefits of the subsequent unroll
2472 : optimizations. We prefer using the MIN_EXPR approach in this situation. */
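 :
 : /* As a sketch (illustrative pseudo-gimple, not what is emitted
 : verbatim), the two schemes for a single rgroup that processes one
 : element per scalar iteration look like:
 :
 : len = MIN (remaining, VF); <- MIN_EXPR approach
 : ... len-controlled statements ...
 : remaining -= len;
 : ptr += VF * size; <- invariant step
 :
 : versus
 :
 : len = .SELECT_VL (remaining, VF); <- SELECT_VL approach
 : ... len-controlled statements ...
 : remaining -= len;
 : ptr += len * size; <- variable step */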
2473 154925 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2474 : {
2475 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2476 0 : if (LOOP_VINFO_LENS (loop_vinfo).length () == 1
2477 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
2478 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2479 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2480 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2481 :
2482 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2483 0 : for (auto rgc : LOOP_VINFO_LENS (loop_vinfo))
2484 0 : if (rgc.type
2485 0 : && !direct_internal_fn_supported_p (IFN_SELECT_VL,
2486 : rgc.type, iv_type,
2487 : OPTIMIZE_FOR_SPEED))
2488 : {
2489 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2490 0 : break;
2491 : }
2492 :
2493 : /* If any of the SLP instances cover more than a single lane
2494 : we cannot use .SELECT_VL at the moment, even if the number
2495 : of lanes is uniform throughout the SLP graph. */
2496 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2497 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
2498 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
2499 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
2500 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
2501 : {
2502 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2503 0 : break;
2504 : }
2505 : }
2506 :
2507 : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2508 : to be able to handle fewer than VF scalars, or needs to have a lower VF
2509 : than the main loop. */
2510 154925 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2511 13521 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2512 : {
2513 13507 : poly_uint64 unscaled_vf
2514 13507 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2515 : orig_loop_vinfo->suggested_unroll_factor);
2516 13507 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
2517 391 : return opt_result::failure_at (vect_location,
2518 : "Vectorization factor too high for"
2519 : " epilogue loop.\n");
2520 : }
2521 :
2522 : /* If the epilogue needs peeling for gaps but the main loop does not,
2523 : give up on the epilogue. */
2524 154534 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2525 13130 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2526 73 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
2527 : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2528 4 : return opt_result::failure_at (vect_location,
2529 : "Epilogue loop requires peeling for gaps "
2530 : "but main loop does not.\n");
2531 :
2532 : /* If an epilogue loop is required make sure we can create one. */
2533 154530 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2534 153249 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2535 56468 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2536 : {
2537 99563 : if (dump_enabled_p ())
2538 5530 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2539 99563 : if (!vect_can_advance_ivs_p (loop_vinfo)
2540 198612 : || !slpeel_can_duplicate_loop_p (loop,
2541 : LOOP_VINFO_MAIN_EXIT (loop_vinfo),
2542 99049 : LOOP_VINFO_MAIN_EXIT (loop_vinfo)))
2543 : {
2544 514 : ok = opt_result::failure_at (vect_location,
2545 : "not vectorized: can't create required "
2546 : "epilog loop\n");
2547 514 : goto again;
2548 : }
2549 : }
2550 :
2551 : /* Check the costings of the loop make vectorizing worthwhile. */
2552 154016 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2553 154016 : if (res < 0 && !param_vect_allow_possibly_not_worthwhile_vectorizations)
2554 : {
2555 29037 : ok = opt_result::failure_at (vect_location,
2556 : "Loop costings may not be worthwhile.\n");
2557 29037 : goto again;
2558 : }
2559 124979 : if (!res)
2560 30881 : return opt_result::failure_at (vect_location,
2561 : "Loop costings not worthwhile.\n");
2562 :
2563 : /* During peeling, we need to check if number of loop iterations is
2564 : enough for both peeled prolog loop and vector loop. This check
2565 : can be merged along with threshold check of loop versioning, so
2566 : increase threshold for this case if necessary.
2567 :
2568 : If we are analyzing an epilogue we still want to check what its
2569 : versioning threshold would be. If we decide to vectorize the epilogues we
2570 : will want to use the lowest versioning threshold of all epilogues and main
2571 : loop. This will enable us to enter a vectorized epilogue even when
2572 : versioning the loop. We can't simply check whether the epilogue requires
2573 : versioning though since we may have skipped some versioning checks when
2574 : analyzing the epilogue. For instance, checks for alias versioning will be
2575 : skipped when dealing with epilogues as we assume we already checked them
2576 : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2577 94098 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2578 : {
2579 8980 : poly_uint64 niters_th = 0;
2580 8980 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2581 :
2582 8980 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2583 : {
2584 : /* Niters for peeled prolog loop. */
2585 8980 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2586 : {
2587 125 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2588 125 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2589 125 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2590 : }
2591 : else
2592 8855 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2593 : }
2594 :
2595 : /* Niters for at least one iteration of vectorized loop. */
2596 8980 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2597 8976 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2598 : /* One additional iteration because of peeling for gap. */
2599 8980 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2600 66 : niters_th += 1;
2601 :
2602 : /* Use the same condition as vect_transform_loop to decide when to use
2603 : the cost to determine a versioning threshold. */
2604 8980 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2605 8980 : && ordered_p (th, niters_th))
2606 6656 : niters_th = ordered_max (poly_uint64 (th), niters_th);
2607 :
2608 8980 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2609 : }
2610 :
2611 94098 : gcc_assert (known_eq (vectorization_factor,
2612 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2613 :
2614 94098 : single_lane_slp_done_for_suggested_uf = force_single_lane;
2615 :
2616 : /* Ok to vectorize! */
2617 94098 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2618 94098 : return opt_result::success ();
2619 :
2620 268053 : again:
2621 : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2622 268053 : gcc_assert (!ok);
2623 :
2624 : /* Try again with single-lane SLP. */
2625 268053 : if (force_single_lane)
2626 132803 : return ok;
2627 :
2628 : /* If we are applying suggested unroll factor, we don't need to
2629 : re-try any more as we want to keep the SLP mode fixed. */
2630 135250 : if (applying_suggested_uf)
2631 10 : return ok;
2632 :
2633 : /* Likewise if the grouped loads or stores in the SLP cannot be handled
2634 : via interleaving or lane instructions. */
2635 : slp_instance instance;
2636 : slp_tree node;
2637 : unsigned i, j;
2638 367024 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2639 : {
2640 232722 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
2641 0 : continue;
2642 :
2643 232722 : stmt_vec_info vinfo;
2644 232722 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2645 232722 : if (!vinfo || !STMT_VINFO_GROUPED_ACCESS (vinfo))
2646 230014 : continue;
2647 2708 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2648 2708 : unsigned int size = DR_GROUP_SIZE (vinfo);
2649 2708 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
2650 2708 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
2651 4737 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2652 5410 : && ! vect_grouped_store_supported (vectype, size))
2653 673 : return opt_result::failure_at (vinfo->stmt,
2654 : "unsupported grouped store\n");
2655 235255 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2656 : {
2657 2201 : vinfo = SLP_TREE_REPRESENTATIVE (node);
2658 2201 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
2659 : {
2660 1925 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2661 1925 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2662 1925 : size = DR_GROUP_SIZE (vinfo);
2663 1925 : vectype = SLP_TREE_VECTYPE (node);
2664 1925 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
2665 1925 : && ! vect_grouped_load_supported (vectype, single_element_p,
2666 : size))
2667 265 : return opt_result::failure_at (vinfo->stmt,
2668 : "unsupported grouped load\n");
2669 : }
2670 : }
2671 : }
2672 :
2673 : /* Roll back state appropriately. Force single-lane SLP this time. */
2674 134302 : force_single_lane = true;
2675 134302 : if (dump_enabled_p ())
2676 3375 : dump_printf_loc (MSG_NOTE, vect_location,
2677 : "re-trying with single-lane SLP\n");
2678 :
2679 : /* Reset the vectorization factor. */
2680 134302 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
2681 : /* Free the SLP instances. */
2682 366079 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2683 231777 : vect_free_slp_instance (instance);
2684 134302 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2685 : /* Reset altered state on stmts. */
2686 510515 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2687 : {
2688 376213 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2689 376213 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
2690 676745 : !gsi_end_p (si); gsi_next (&si))
2691 : {
2692 300532 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2693 300532 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2694 300532 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2695 : {
2696 : /* vectorizable_reduction adjusts reduction stmt def-types,
2697 : restore them to that of the PHI. */
2698 25537 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2699 25537 : = STMT_VINFO_DEF_TYPE (stmt_info);
2700 25537 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2701 : (STMT_VINFO_REDUC_DEF (stmt_info)))
2702 25537 : = STMT_VINFO_DEF_TYPE (stmt_info);
2703 : }
2704 : }
2705 : }
2706 : /* Free optimized alias test DDRS. */
2707 134302 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2708 134302 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2709 134302 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2710 : /* Reset target cost data. */
2711 134302 : delete loop_vinfo->vector_costs;
2712 134302 : loop_vinfo->vector_costs = nullptr;
2713 : /* Reset accumulated rgroup information. */
2714 134302 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
2715 134302 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
2716 134302 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2717 : /* Reset assorted flags. */
2718 134302 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2719 134302 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2720 134302 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2721 134302 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2722 134302 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2723 134302 : = saved_can_use_partial_vectors_p;
2724 134302 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2725 134302 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2726 134302 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2727 134302 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = false;
2728 :
2729 134302 : if (loop_vinfo->scan_map)
2730 122 : loop_vinfo->scan_map->empty ();
2731 :
2732 134302 : goto start_over;
2733 : }
2734 :
2735 : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2736 : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2737 : OLD_LOOP_VINFO is better unless something specifically indicates
2738 : otherwise.
2739 :
2740 : Note that this deliberately isn't a partial order. */
2741 :
2742 : static bool
2743 32297 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2744 : loop_vec_info old_loop_vinfo)
2745 : {
2746 32297 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2747 32297 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2748 :
2749 32297 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2750 32297 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2751 :
2752 : /* Always prefer a VF of loop->simdlen over any other VF. */
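 : /* E.g. (a sketch): under "#pragma omp simd simdlen(8)" a candidate
 : with VF == 8 is preferred over any other candidate, whichever
 : side of the comparison it appears on. */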
2753 32297 : if (loop->simdlen)
2754 : {
2755 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2756 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2757 0 : if (new_simdlen_p != old_simdlen_p)
2758 : return new_simdlen_p;
2759 : }
2760 :
2761 32297 : const auto *old_costs = old_loop_vinfo->vector_costs;
2762 32297 : const auto *new_costs = new_loop_vinfo->vector_costs;
2763 32297 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2764 1516 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2765 :
2766 30781 : return new_costs->better_main_loop_than_p (old_costs);
2767 : }
2768 :
2769 : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2770 : true if we should. */
2771 :
2772 : static bool
2773 32297 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2774 : loop_vec_info old_loop_vinfo)
2775 : {
2776 32297 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2777 : return false;
2778 :
2779 1121 : if (dump_enabled_p ())
2780 11 : dump_printf_loc (MSG_NOTE, vect_location,
2781 : "***** Preferring vector mode %s to vector mode %s\n",
2782 11 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
2783 11 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
2784 : return true;
2785 : }
2786 :
2787 : /* Analyze LOOP with VECTOR_MODES[MODE_I] and, if ORIG_LOOP_VINFO is
2788 : not NULL, as an epilogue. When MASKED_P is not -1, override the default
2789 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it.
2790 : Set AUTODETECTED_VECTOR_MODE if it is still VOIDmode, and advance MODE_I
2791 : to the next mode useful to analyze.
2792 : Return the loop_vinfo on success and wrapped null on failure. */
2793 :
2794 : static opt_loop_vec_info
2795 571928 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2796 : const vect_loop_form_info *loop_form_info,
2797 : loop_vec_info orig_loop_vinfo,
2798 : const vector_modes &vector_modes, unsigned &mode_i,
2799 : int masked_p,
2800 : machine_mode &autodetected_vector_mode,
2801 : bool &fatal)
2802 : {
2803 571928 : loop_vec_info loop_vinfo
2804 571928 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
2805 :
2806 571928 : machine_mode vector_mode = vector_modes[mode_i];
2807 571928 : loop_vinfo->vector_mode = vector_mode;
2808 571928 : unsigned int suggested_unroll_factor = 1;
2809 571928 : bool single_lane_slp_done_for_suggested_uf = false;
2810 :
2811 : /* Run the main analysis. */
2812 571928 : opt_result res = vect_analyze_loop_2 (loop_vinfo, masked_p, fatal,
2813 : &suggested_unroll_factor,
2814 : single_lane_slp_done_for_suggested_uf);
2815 571928 : if (dump_enabled_p ())
2816 21199 : dump_printf_loc (MSG_NOTE, vect_location,
2817 : "***** Analysis %s with vector mode %s\n",
2818 21199 : res ? "succeeded" : "failed",
2819 21199 : GET_MODE_NAME (loop_vinfo->vector_mode));
2820 :
2821 571928 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
2822 571928 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2823 : /* Check to see if the user wants to unroll or if the target wants to. */
2824 657308 : && (suggested_unroll_factor > 1 || user_unroll > 1))
2825 : {
2826 463 : if (suggested_unroll_factor == 1)
2827 : {
2828 66 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
2829 66 : suggested_unroll_factor = user_unroll / assumed_vf;
2830 66 : if (suggested_unroll_factor > 1)
2831 : {
2832 40 : if (dump_enabled_p ())
2833 20 : dump_printf_loc (MSG_NOTE, vect_location,
2834 : "setting unroll factor to %d based on user requested "
2835 : "unroll factor %d and suggested vectorization "
2836 : "factor: %d\n",
2837 : suggested_unroll_factor, user_unroll, assumed_vf);
2838 : }
2839 : }
2840 :
2841 463 : if (suggested_unroll_factor > 1)
2842 : {
2843 437 : if (dump_enabled_p ())
2844 56 : dump_printf_loc (MSG_NOTE, vect_location,
2845 : "***** Re-trying analysis for unrolling"
2846 : " with unroll factor %d and %s slp.\n",
2847 : suggested_unroll_factor,
2848 : single_lane_slp_done_for_suggested_uf
2849 : ? "single-lane" : "");
2850 437 : loop_vec_info unroll_vinfo
2851 437 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
2852 437 : unroll_vinfo->vector_mode = vector_mode;
2853 437 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2854 437 : opt_result new_res
2855 437 : = vect_analyze_loop_2 (unroll_vinfo, masked_p, fatal, NULL,
2856 : single_lane_slp_done_for_suggested_uf);
2857 437 : if (new_res)
2858 : {
2859 378 : delete loop_vinfo;
2860 378 : loop_vinfo = unroll_vinfo;
2861 : }
2862 : else
2863 59 : delete unroll_vinfo;
2864 : }
2865 :
2866 : /* Record that we have honored a user unroll factor. */
2867 463 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
2868 : }
2869 :
2870 : /* Remember the autodetected vector mode. */
2871 571928 : if (vector_mode == VOIDmode)
2872 266245 : autodetected_vector_mode = loop_vinfo->vector_mode;
2873 :
2874 : /* Advance mode_i, first skipping modes that would yield the same
2875 : analysis result.  */
2876 2530506 : while (mode_i + 1 < vector_modes.length ()
2877 1742562 : && vect_chooses_same_modes_p (loop_vinfo,
2878 763273 : vector_modes[mode_i + 1]))
2879 : {
2880 407361 : if (dump_enabled_p ())
2881 17061 : dump_printf_loc (MSG_NOTE, vect_location,
2882 : "***** The result for vector mode %s would"
2883 : " be the same\n",
2884 17061 : GET_MODE_NAME (vector_modes[mode_i + 1]));
2885 407361 : mode_i += 1;
2886 : }
2887 571928 : if (mode_i + 1 < vector_modes.length ()
2888 927840 : && vect_chooses_same_modes_p (autodetected_vector_mode,
2889 355912 : vector_modes[mode_i + 1]))
2890 : {
2891 428 : if (dump_enabled_p ())
2892 11 : dump_printf_loc (MSG_NOTE, vect_location,
2893 : "***** Skipping vector mode %s, which would"
2894 : " repeat the analysis for %s\n",
2895 11 : GET_MODE_NAME (vector_modes[mode_i + 1]),
2896 11 : GET_MODE_NAME (autodetected_vector_mode));
2897 428 : mode_i += 1;
2898 : }
2899 571928 : mode_i++;
2900 :
2901 571928 : if (!res)
2902 : {
2903 478208 : delete loop_vinfo;
2904 478208 : if (fatal)
2905 103479 : gcc_checking_assert (orig_loop_vinfo == NULL);
2906 478208 : return opt_loop_vec_info::propagate_failure (res);
2907 : }
2908 :
2909 93720 : return opt_loop_vec_info::success (loop_vinfo);
2910 : }
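 :
 : /* As an illustration of the mode skipping above: if the analysis
 :    with the current mode chose exactly the same vector types that
 :    the next entry in VECTOR_MODES would force, re-running the
 :    analysis with that mode could only reproduce the same result,
 :    so MODE_I advances past it (see the "would be the same" and
 :    "would repeat the analysis" dumps above).  */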
2911 :
2912 : /* Function vect_analyze_loop.
2913 :
2914 : Apply a set of analyses on LOOP, and create a loop_vec_info struct
2915 : for it. The different analyses will record information in the
2916 : loop_vec_info struct. */
2917 : opt_loop_vec_info
2918 464011 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
2919 : vec_info_shared *shared)
2920 : {
2921 464011 : DUMP_VECT_SCOPE ("analyze_loop_nest");
2922 :
2923 464011 : if (loop_outer (loop)
2924 464011 : && loop_vec_info_for_loop (loop_outer (loop))
2925 464594 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2926 583 : return opt_loop_vec_info::failure_at (vect_location,
2927 : "outer-loop already vectorized.\n");
2928 :
2929 463428 : if (!find_loop_nest (loop, &shared->loop_nest))
2930 22249 : return opt_loop_vec_info::failure_at
2931 22249 : (vect_location,
2932 : "not vectorized: loop nest containing two or more consecutive inner"
2933 : " loops cannot be vectorized\n");
2934 :
2935 : /* Analyze the loop form. */
2936 441179 : vect_loop_form_info loop_form_info;
2937 441179 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
2938 : &loop_form_info);
2939 441179 : if (!res)
2940 : {
2941 174934 : if (dump_enabled_p ())
2942 1531 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2943 : "bad loop form.\n");
2944 174934 : return opt_loop_vec_info::propagate_failure (res);
2945 : }
2946 266245 : if (!integer_onep (loop_form_info.assumptions))
2947 : {
2948 : /* We consider vectorizing this loop by versioning it under
2949 : some assumptions. In order to do this, we need to clear
2950 : existing information computed by scev and niter analyzer. */
2951 8264 : scev_reset_htab ();
2952 8264 : free_numbers_of_iterations_estimates (loop);
2953 : /* Also set flag for this loop so that following scev and niter
2954 : analysis are done under the assumptions. */
2955 8264 : loop_constraint_set (loop, LOOP_C_FINITE);
2956 : }
2957 : else
2958 : /* Clear the existing niter information to make sure the nonwrapping flag
2959 : will be calculated and set appropriately.  */
2960 257981 : free_numbers_of_iterations_estimates (loop);
2961 :
2962 266245 : auto_vector_modes vector_modes;
2963 : /* Autodetect first vector size we try. */
2964 266245 : vector_modes.safe_push (VOIDmode);
2965 266245 : unsigned int autovec_flags
2966 532490 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2967 266245 : loop->simdlen != 0);
2968 266245 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2969 266245 : && !unlimited_cost_model (loop));
2970 266245 : machine_mode autodetected_vector_mode = VOIDmode;
2971 266245 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2972 266245 : unsigned int mode_i = 0;
2973 266245 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2974 :
2975 : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
2976 : a mode has not been analyzed. */
2977 266245 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
2978 2674272 : for (unsigned i = 0; i < vector_modes.length (); ++i)
2979 1070891 : cached_vf_per_mode.safe_push (0);
2980 :
2981 : /* First determine the main loop vectorization mode, either the first
2982 : one that works, starting with auto-detecting the vector mode and then
2983 : following the targets order of preference, or the one with the
2984 : lowest cost if pick_lowest_cost_p. */
2985 849937 : while (1)
2986 : {
2987 558091 : bool fatal;
2988 558091 : unsigned int last_mode_i = mode_i;
2989 : /* Set cached VF to -1 prior to analysis, which indicates a mode has
2990 : failed. */
2991 558091 : cached_vf_per_mode[last_mode_i] = -1;
2992 558091 : opt_loop_vec_info loop_vinfo
2993 558091 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
2994 : NULL, vector_modes, mode_i, -1,
2995 : autodetected_vector_mode, fatal);
2996 558091 : if (fatal)
2997 : break;
2998 :
2999 454612 : if (loop_vinfo)
3000 : {
3001 : /* Analysis has been successful so update the VF value.  The
3002 : VF should always be a multiple of unroll_factor and we want to
3003 : capture the original VF here. */
3004 85380 : cached_vf_per_mode[last_mode_i]
3005 85380 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3006 85380 : loop_vinfo->suggested_unroll_factor);
3007 : /* Once we hit the desired simdlen for the first time,
3008 : discard any previous attempts. */
3009 85380 : if (simdlen
3010 85380 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3011 : {
3012 47 : delete first_loop_vinfo;
3013 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3014 : simdlen = 0;
3015 : }
3016 85333 : else if (pick_lowest_cost_p
3017 71450 : && first_loop_vinfo
3018 116114 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3019 : {
3020 : /* Pick loop_vinfo over first_loop_vinfo. */
3021 935 : delete first_loop_vinfo;
3022 935 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3023 : }
3024 85380 : if (first_loop_vinfo == NULL)
3025 : first_loop_vinfo = loop_vinfo;
3026 : else
3027 : {
3028 29848 : delete loop_vinfo;
3029 29848 : loop_vinfo = opt_loop_vec_info::success (NULL);
3030 : }
3031 :
3032 : /* Commit to first_loop_vinfo if we have no reason to try
3033 : alternatives. */
3034 85380 : if (!simdlen && !pick_lowest_cost_p)
3035 : break;
3036 : }
3037 440691 : if (mode_i == vector_modes.length ()
3038 440691 : || autodetected_vector_mode == VOIDmode)
3039 : break;
3040 :
3041 : /* Try the next vector mode.  */
3042 291846 : if (dump_enabled_p ())
3043 4754 : dump_printf_loc (MSG_NOTE, vect_location,
3044 : "***** Re-trying analysis with vector mode %s\n",
3045 4754 : GET_MODE_NAME (vector_modes[mode_i]));
3046 291846 : }
3047 266245 : if (!first_loop_vinfo)
3048 211653 : return opt_loop_vec_info::propagate_failure (res);
3049 :
3050 54592 : if (dump_enabled_p ())
3051 9511 : dump_printf_loc (MSG_NOTE, vect_location,
3052 : "***** Choosing vector mode %s\n",
3053 9511 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3054 :
3055 : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3056 : enabled, SIMDUID is not set, it is the innermost loop and we have
3057 : either already found the loop's SIMDLEN or there was no SIMDLEN to
3058 : begin with.
3059 : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3060 54592 : bool vect_epilogues = (!simdlen
3061 54590 : && loop->inner == NULL
3062 53993 : && param_vect_epilogues_nomask
3063 52853 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3064 : /* No code motion support for multiple epilogues so for now
3065 : not supported when there are multiple exits.  */
3066 25865 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3067 25393 : && !loop->simduid
3068 78572 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3069 54592 : if (!vect_epilogues)
3070 41786 : return first_loop_vinfo;
3071 :
3072 : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3073 :
3074 : /* For epilogues start the analysis from the first mode. The motivation
3075 : behind starting from the beginning comes from cases where the VECTOR_MODES
3076 : array may contain length-agnostic and length-specific modes. Their
3077 : ordering is not guaranteed, so we could end up picking a mode for the main
3078 : loop that is after the epilogue's optimal mode. */
3079 12806 : int masked_p = -1;
3080 12806 : if (!unlimited_cost_model (loop)
3081 12806 : && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3082 : != VOIDmode))
3083 : {
3084 4 : vector_modes[0]
3085 4 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3086 4 : cached_vf_per_mode[0] = 0;
3087 : }
3088 : else
3089 12802 : vector_modes[0] = autodetected_vector_mode;
3090 12806 : mode_i = 0;
3091 :
3092 12842 : bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
3093 12806 : || masked_p == 1);
3094 : if (supports_partial_vectors
3095 36 : && !partial_vectors_supported_p ()
3096 36 : && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo))
3097 : supports_partial_vectors = false;
3098 12806 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3099 :
3100 12806 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3101 12988 : do
3102 : {
3103 : /* Let the user override what the target suggests. */
3104 12897 : if (OPTION_SET_P (param_vect_partial_vector_usage))
3105 45 : masked_p = -1;
3106 :
3107 49938 : while (1)
3108 : {
3109 : /* If the target does not support partial vectors we can shorten the
3110 : number of modes to analyze for the epilogue as we know we can't
3111 : pick a mode that would lead to a VF at least as big as the
3112 : FIRST_VINFO_VF. */
3113 66365 : if (!supports_partial_vectors
3114 49938 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3115 : {
3116 23481 : mode_i++;
3117 46962 : if (mode_i == vector_modes.length ())
3118 : break;
3119 29047 : continue;
3120 : }
3121 : /* We would need an exhaustive search to find all modes we
3122 : skipped that would lead to the same result as the analysis
3123 : they were skipped for and whose cached_vf_per_mode entries
3124 : we could then check.
3125 : Check for the autodetected mode, which is the common
3126 : situation on x86, which does not perform cost comparison.  */
3127 39077 : if (!supports_partial_vectors
3128 26414 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3129 52078 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3130 25621 : vector_modes[mode_i]))
3131 : {
3132 12620 : mode_i++;
3133 25240 : if (mode_i == vector_modes.length ())
3134 : break;
3135 12620 : continue;
3136 : }
3137 :
3138 13837 : if (dump_enabled_p ())
3139 3253 : dump_printf_loc (MSG_NOTE, vect_location,
3140 : "***** Re-trying epilogue analysis with vector "
3141 3253 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3142 :
3143 13837 : bool fatal;
3144 13837 : opt_loop_vec_info loop_vinfo
3145 13837 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3146 : orig_loop_vinfo,
3147 : vector_modes, mode_i, masked_p,
3148 : autodetected_vector_mode, fatal);
3149 13837 : if (fatal)
3150 : break;
3151 :
3152 13837 : if (loop_vinfo)
3153 : {
3154 8340 : if (pick_lowest_cost_p
3155 5392 : && orig_loop_vinfo->epilogue_vinfo
3156 9856 : && vect_joust_loop_vinfos (loop_vinfo,
3157 1516 : orig_loop_vinfo->epilogue_vinfo))
3158 : {
3159 186 : gcc_assert (vect_epilogues);
3160 186 : delete orig_loop_vinfo->epilogue_vinfo;
3161 186 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3162 : }
3163 8340 : if (!orig_loop_vinfo->epilogue_vinfo)
3164 7010 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3165 : else
3166 : {
3167 1330 : delete loop_vinfo;
3168 1330 : loop_vinfo = opt_loop_vec_info::success (NULL);
3169 : }
3170 :
3171 : /* For now only allow one epilogue loop, but allow
3172 : pick_lowest_cost_p to replace it, so commit to the
3173 : first epilogue if we have no reason to try alternatives. */
3174 8340 : if (!pick_lowest_cost_p)
3175 : break;
3176 : }
3177 :
3178 : /* Revert to the default from the suggested preferred
3179 : epilogue vectorization mode. */
3180 10889 : masked_p = -1;
3181 21778 : if (mode_i == vector_modes.length ())
3182 : break;
3183 : }
3184 :
3185 12897 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3186 12897 : if (!orig_loop_vinfo)
3187 : break;
3188 :
3189 : /* When we selected a first vectorized epilogue, see if the target
3190 : suggests to have another one. */
3191 : suggests another one.  */
3192 6824 : if (!unlimited_cost_model (loop)
3193 3882 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3194 10700 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3195 : != VOIDmode))
3196 : {
3197 182 : vector_modes[0]
3198 91 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3199 91 : cached_vf_per_mode[0] = 0;
3200 91 : mode_i = 0;
3201 : }
3202 : else
3203 : break;
3204 91 : }
3205 : while (1);
3206 :
3207 12806 : if (first_loop_vinfo->epilogue_vinfo)
3208 : {
3209 6741 : poly_uint64 lowest_th
3210 6741 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3211 6741 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3212 6824 : do
3213 : {
3214 6824 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3215 6824 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3216 : || maybe_ne (lowest_th, 0U));
3217 : /* Keep track of the known smallest versioning threshold. */
3218 6824 : if (ordered_p (lowest_th, th))
3219 6824 : lowest_th = ordered_min (lowest_th, th);
3220 6824 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3221 : }
3222 6824 : while (epilog_vinfo);
3223 6741 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3224 6741 : if (dump_enabled_p ())
3225 1447 : dump_printf_loc (MSG_NOTE, vect_location,
3226 : "***** Choosing epilogue vector mode %s\n",
3227 1447 : GET_MODE_NAME
3228 : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3229 : }
3230 :
3231 12806 : return first_loop_vinfo;
3232 707424 : }
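 :
 : /* In outline, the driver above behaves as follows (simplified
 :    sketch, not literal code):
 :
 :      analyze the loop with each mode in VECTOR_MODES until one
 :        succeeds (or, with VECT_COMPARE_COSTS, keep the cheapest);
 :      if epilogue vectorization applies, redo the per-mode analysis
 :        with ORIG_LOOP_VINFO set, chaining the winners through
 :        first_loop_vinfo->epilogue_vinfo->...;
 :      record the smallest versioning threshold seen along that
 :        chain on FIRST_LOOP_VINFO.  */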
3233 :
3234 : /* Return true if there is an in-order reduction function for CODE, storing
3235 : it in *REDUC_FN if so. */
3236 :
3237 : static bool
3238 5083 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3239 : {
3240 : /* We support MINUS_EXPR by negating the operand. This also preserves an
3241 : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3242 : (-0.0) = -0.0. */
3243 5083 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3244 : {
3245 4411 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3246 0 : return true;
3247 : }
3248 : return false;
3249 : }
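 :
 : /* E.g. the in-order reduction (illustrative source, not from this
 :    file)
 :
 :      for (i = 0; i < n; i++)
 :        s -= a[i];
 :
 :    can use IFN_FOLD_LEFT_PLUS on the negated operand, since
 :    s - a[i] == s + (-a[i]) for each element, preserving the strict
 :    left-to-right evaluation order.  */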
3250 :
3251 : /* Function reduction_fn_for_scalar_code
3252 :
3253 : Input:
3254 : CODE - tree_code of a reduction operation.
3255 :
3256 : Output:
3257 : REDUC_FN - the corresponding internal function to be used to reduce the
3258 : vector of partial results into a single scalar result, or IFN_LAST
3259 : if the operation is a supported reduction operation, but does not have
3260 : such an internal function.
3261 :
3262 : Return FALSE if CODE currently cannot be vectorized as a reduction.  */
3263 :
3264 : bool
3265 1994254 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3266 : {
3267 1994254 : if (code.is_tree_code ())
3268 1994196 : switch (tree_code (code))
3269 : {
3270 15313 : case MAX_EXPR:
3271 15313 : *reduc_fn = IFN_REDUC_MAX;
3272 15313 : return true;
3273 :
3274 50003 : case MIN_EXPR:
3275 50003 : *reduc_fn = IFN_REDUC_MIN;
3276 50003 : return true;
3277 :
3278 1083086 : case PLUS_EXPR:
3279 1083086 : *reduc_fn = IFN_REDUC_PLUS;
3280 1083086 : return true;
3281 :
3282 233677 : case BIT_AND_EXPR:
3283 233677 : *reduc_fn = IFN_REDUC_AND;
3284 233677 : return true;
3285 :
3286 286772 : case BIT_IOR_EXPR:
3287 286772 : *reduc_fn = IFN_REDUC_IOR;
3288 286772 : return true;
3289 :
3290 43569 : case BIT_XOR_EXPR:
3291 43569 : *reduc_fn = IFN_REDUC_XOR;
3292 43569 : return true;
3293 :
3294 281776 : case MULT_EXPR:
3295 281776 : case MINUS_EXPR:
3296 281776 : *reduc_fn = IFN_LAST;
3297 281776 : return true;
3298 :
3299 : default:
3300 : return false;
3301 : }
3302 : else
3303 58 : switch (combined_fn (code))
3304 : {
3305 34 : CASE_CFN_FMAX:
3306 34 : *reduc_fn = IFN_REDUC_FMAX;
3307 34 : return true;
3308 :
3309 24 : CASE_CFN_FMIN:
3310 24 : *reduc_fn = IFN_REDUC_FMIN;
3311 24 : return true;
3312 :
3313 : default:
3314 : return false;
3315 : }
3316 : }
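 :
 : /* For example, a loop like (illustrative source)
 :
 :      for (i = 0; i < n; i++)
 :        m = m > a[i] ? m : a[i];
 :
 :    reduces with MAX_EXPR, so the vector of partial maxima is
 :    collapsed into a scalar with IFN_REDUC_MAX.  MULT_EXPR and
 :    MINUS_EXPR are supported but get IFN_LAST, so their final value
 :    is computed by a generic element-wise epilogue instead.  */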
3317 :
3318 : /* Set *SBOOL_FN to the corresponding function working on vector masks
3319 : for REDUC_FN. Return true if that exists, false otherwise. */
3320 :
3321 : static bool
3322 0 : sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
3323 : {
3324 0 : switch (reduc_fn)
3325 : {
3326 0 : case IFN_REDUC_AND:
3327 0 : *sbool_fn = IFN_REDUC_SBOOL_AND;
3328 0 : return true;
3329 0 : case IFN_REDUC_IOR:
3330 0 : *sbool_fn = IFN_REDUC_SBOOL_IOR;
3331 0 : return true;
3332 0 : case IFN_REDUC_XOR:
3333 0 : *sbool_fn = IFN_REDUC_SBOOL_XOR;
3334 0 : return true;
3335 : default:
3336 : return false;
3337 : }
3338 : }
3339 :
3340 : /* If there is a neutral value X such that a reduction would not be affected
3341 : by the introduction of additional X elements, return that X, otherwise
3342 : return null. CODE is the code of the reduction and SCALAR_TYPE is type
3343 : of the scalar elements. If the reduction has just a single initial value
3344 : then INITIAL_VALUE is that value, otherwise it is null.
3345 : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3346 : In that case no signed zero is returned. */
3347 :
3348 : tree
3349 77299 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3350 : tree initial_value, bool as_initial)
3351 : {
3352 77299 : if (code.is_tree_code ())
3353 77241 : switch (tree_code (code))
3354 : {
3355 13812 : case DOT_PROD_EXPR:
3356 13812 : case SAD_EXPR:
3357 13812 : case MINUS_EXPR:
3358 13812 : case BIT_IOR_EXPR:
3359 13812 : case BIT_XOR_EXPR:
3360 13812 : return build_zero_cst (scalar_type);
3361 57326 : case WIDEN_SUM_EXPR:
3362 57326 : case PLUS_EXPR:
3363 57326 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3364 92 : return build_real (scalar_type, dconstm0);
3365 : else
3366 57234 : return build_zero_cst (scalar_type);
3367 :
3368 2109 : case MULT_EXPR:
3369 2109 : return build_one_cst (scalar_type);
3370 :
3371 1558 : case BIT_AND_EXPR:
3372 1558 : return build_all_ones_cst (scalar_type);
3373 :
3374 : case MAX_EXPR:
3375 : case MIN_EXPR:
3376 : return initial_value;
3377 :
3378 408 : default:
3379 408 : return NULL_TREE;
3380 : }
3381 : else
3382 58 : switch (combined_fn (code))
3383 : {
3384 : CASE_CFN_FMIN:
3385 : CASE_CFN_FMAX:
3386 : return initial_value;
3387 :
3388 0 : default:
3389 0 : return NULL_TREE;
3390 : }
3391 : }
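 :
 : /* For instance, padding a PLUS_EXPR sum of { a, b, c } out to a
 :    full vector with the neutral value 0 yields a + b + c + 0 + ...,
 :    leaving the result unchanged; likewise 1 for MULT_EXPR and
 :    all-ones for BIT_AND_EXPR.  MIN_EXPR/MAX_EXPR have no universal
 :    neutral element, which is why the single initial value is
 :    returned for them instead.  */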
3392 :
3393 : /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3394 : STMT is printed with a message MSG. */
3395 :
3396 : static void
3397 578 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3398 : {
3399 578 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3400 578 : }
3401 :
3402 : /* Return true if we need an in-order reduction for operation CODE
3403 : on type TYPE.  */
3405 :
3406 : bool
3407 6465614 : needs_fold_left_reduction_p (tree type, code_helper code)
3408 : {
3409 : /* CHECKME: check for !flag_finite_math_only too? */
3410 6465614 : if (SCALAR_FLOAT_TYPE_P (type))
3411 : {
3412 576308 : if (code.is_tree_code ())
3413 576254 : switch (tree_code (code))
3414 : {
3415 : case MIN_EXPR:
3416 : case MAX_EXPR:
3417 : return false;
3418 :
3419 574382 : default:
3420 574382 : return !flag_associative_math;
3421 : }
3422 : else
3423 54 : switch (combined_fn (code))
3424 : {
3425 : CASE_CFN_FMIN:
3426 : CASE_CFN_FMAX:
3427 : return false;
3428 :
3429 2 : default:
3430 2 : return !flag_associative_math;
3431 : }
3432 : }
3433 :
3434 5889306 : if (INTEGRAL_TYPE_P (type))
3435 5888434 : return (!code.is_tree_code ()
3436 5888434 : || !operation_no_trapping_overflow (type, tree_code (code)));
3437 :
3438 872 : if (SAT_FIXED_POINT_TYPE_P (type))
3439 : return true;
3440 :
3441 : return false;
3442 : }
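 :
 : /* Example: without -fassociative-math a float accumulation
 :    (illustrative source)
 :
 :      for (i = 0; i < n; i++)
 :        s += a[i];
 :
 :    must stay in-order, because reassociation changes rounding:
 :    (1e30f + -1e30f) + 1.0f == 1.0f while
 :    1e30f + (-1e30f + 1.0f) == 0.0f.  */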
3443 :
3444 : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3445 : has a handled computation expression. Store the main reduction
3446 : operation in *CODE. */
3447 :
3448 : static bool
3449 101642 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3450 : tree loop_arg, code_helper *code,
3451 : vec<std::pair<ssa_op_iter, use_operand_p> > &path,
3452 : bool inner_loop_of_double_reduc)
3453 : {
3454 101642 : auto_bitmap visited;
3455 101642 : tree lookfor = PHI_RESULT (phi);
3456 101642 : ssa_op_iter curri;
3457 101642 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3458 211417 : while (USE_FROM_PTR (curr) != loop_arg)
3459 8133 : curr = op_iter_next_use (&curri);
3460 101642 : curri.i = curri.numops;
3461 946604 : do
3462 : {
3463 946604 : path.safe_push (std::make_pair (curri, curr));
3464 946604 : tree use = USE_FROM_PTR (curr);
3465 946604 : if (use == lookfor)
3466 : break;
3467 845362 : gimple *def = SSA_NAME_DEF_STMT (use);
3468 845362 : if (gimple_nop_p (def)
3469 845362 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3470 : {
3471 711580 : pop:
3472 711580 : do
3473 : {
3474 711580 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3475 711580 : curri = x.first;
3476 711580 : curr = x.second;
3477 778803 : do
3478 778803 : curr = op_iter_next_use (&curri);
3479 : /* Skip already visited or non-SSA operands (from iterating
3480 : over PHI args). */
3481 : while (curr != NULL_USE_OPERAND_P
3482 1557606 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3483 269021 : || ! bitmap_set_bit (visited,
3484 269021 : SSA_NAME_VERSION
3485 : (USE_FROM_PTR (curr)))));
3486 : }
3487 1423160 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3488 237955 : if (curr == NULL_USE_OPERAND_P)
3489 : break;
3490 : }
3491 : else
3492 : {
3493 711045 : if (gimple_code (def) == GIMPLE_PHI)
3494 72163 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3495 : else
3496 638882 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3497 : while (curr != NULL_USE_OPERAND_P
3498 849187 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3499 740415 : || ! bitmap_set_bit (visited,
3500 740415 : SSA_NAME_VERSION
3501 : (USE_FROM_PTR (curr)))))
3502 138142 : curr = op_iter_next_use (&curri);
3503 711045 : if (curr == NULL_USE_OPERAND_P)
3504 103638 : goto pop;
3505 : }
3506 : }
3507 : while (1);
3508 101642 : if (dump_file && (dump_flags & TDF_DETAILS))
3509 : {
3510 4076 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3511 4076 : unsigned i;
3512 4076 : std::pair<ssa_op_iter, use_operand_p> *x;
3513 13845 : FOR_EACH_VEC_ELT (path, i, x)
3514 9769 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3515 4076 : dump_printf (MSG_NOTE, "\n");
3516 : }
3517 :
3518 : /* Check whether the reduction path detected is valid. */
3519 101642 : bool fail = path.length () == 0;
3520 101642 : bool neg = false;
3521 101642 : int sign = -1;
3522 101642 : *code = ERROR_MARK;
3523 218667 : for (unsigned i = 1; i < path.length (); ++i)
3524 : {
3525 123703 : gimple *use_stmt = USE_STMT (path[i].second);
3526 123703 : gimple_match_op op;
3527 123703 : if (!gimple_extract_op (use_stmt, &op))
3528 : {
3529 : fail = true;
3530 6678 : break;
3531 : }
3532 122816 : unsigned int opi = op.num_ops;
3533 122816 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3534 : {
3535 : /* The following make sure we can compute the operand index
3536 : easily plus it mostly disallows chaining via COND_EXPR condition
3537 : operands. */
3538 189968 : for (opi = 0; opi < op.num_ops; ++opi)
3539 188956 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3540 : break;
3541 : }
3542 6228 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3543 : {
3544 12477 : for (opi = 0; opi < op.num_ops; ++opi)
3545 12477 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3546 : break;
3547 : }
3548 122816 : if (opi == op.num_ops)
3549 : {
3550 : fail = true;
3551 : break;
3552 : }
3553 121804 : op.code = canonicalize_code (op.code, op.type);
3554 121804 : if (op.code == MINUS_EXPR)
3555 : {
3556 5594 : op.code = PLUS_EXPR;
3557 : /* Track whether we negate the reduction value each iteration. */
3558 5594 : if (op.ops[1] == op.ops[opi])
3559 34 : neg = ! neg;
3560 : }
3561 116210 : else if (op.code == IFN_COND_SUB)
3562 : {
3563 9 : op.code = IFN_COND_ADD;
3564 : /* Track whether we negate the reduction value each iteration. */
3565 9 : if (op.ops[2] == op.ops[opi])
3566 0 : neg = ! neg;
3567 : }
3568 : /* For an FMA the reduction code is the PLUS if the addition chain
3569 : is the reduction. */
3570 116201 : else if (op.code == IFN_FMA && opi == 2)
3571 33 : op.code = PLUS_EXPR;
3572 121804 : if (CONVERT_EXPR_CODE_P (op.code)
3573 121804 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3574 : ;
3575 116326 : else if (*code == ERROR_MARK)
3576 : {
3577 99440 : *code = op.code;
3578 99440 : sign = TYPE_SIGN (op.type);
3579 : }
3580 16886 : else if (op.code != *code)
3581 : {
3582 : fail = true;
3583 : break;
3584 : }
3585 15564 : else if ((op.code == MIN_EXPR
3586 15408 : || op.code == MAX_EXPR)
3587 15579 : && sign != TYPE_SIGN (op.type))
3588 : {
3589 : fail = true;
3590 : break;
3591 : }
3592 : /* Check there's only a single stmt the op is used on. For the
3593 : non-value-changing tail and the last stmt, allow out-of-loop uses,
3594 : but not when this is the inner loop of a double reduction.
3595 : ??? We could relax this and handle arbitrary live stmts by
3596 : forcing a scalar epilogue for example. */
3597 120479 : imm_use_iterator imm_iter;
3598 120479 : use_operand_p use_p;
3599 120479 : gimple *op_use_stmt;
3600 120479 : unsigned cnt = 0;
3601 126672 : bool cond_fn_p = op.code.is_internal_fn ()
3602 6193 : && (conditional_internal_fn_code (internal_fn (op.code))
3603 120479 : != ERROR_MARK);
3604 :
3605 409684 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3606 : {
3607 : /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
3608 : have op1 twice (once as definition, once as else) in the same
3609 : operation. Enforce this. */
3610 168726 : if (cond_fn_p && op_use_stmt == use_stmt)
3611 : {
3612 6127 : gcall *call = as_a<gcall *> (use_stmt);
3613 6127 : unsigned else_pos
3614 6127 : = internal_fn_else_index (internal_fn (op.code));
3615 6127 : if (gimple_call_arg (call, else_pos) != op.ops[opi])
3616 : {
3617 : fail = true;
3618 : break;
3619 : }
3620 30635 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
3621 : {
3622 24508 : if (j == else_pos)
3623 6127 : continue;
3624 18381 : if (gimple_call_arg (call, j) == op.ops[opi])
3625 6127 : cnt++;
3626 : }
3627 : }
3628 162599 : else if (!is_gimple_debug (op_use_stmt)
3629 162599 : && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
3630 2813 : || flow_bb_inside_loop_p (loop,
3631 2813 : gimple_bb (op_use_stmt))))
3632 235759 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3633 117884 : cnt++;
3634 120479 : }
3635 :
3636 120479 : if (cnt != 1)
3637 : {
3638 : fail = true;
3639 : break;
3640 : }
3641 : }
3642 108731 : return ! fail && ! neg && *code != ERROR_MARK;
3643 101642 : }
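 :
 : /* For a simple chain such as
 :
 :      s_1 = PHI <s_0(preheader), s_3(latch)>
 :      t_2 = ...;
 :      s_3 = s_1 + t_2;
 :
 :    the walk above records the path from s_1 to s_3 and classifies
 :    the reduction code as PLUS_EXPR; an extra in-loop use of s_1 or
 :    s_3, or a mix of operation codes along the path, makes the
 :    check fail.  */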
3644 :
3645 : bool
3646 21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3647 : tree loop_arg, enum tree_code code)
3648 : {
3649 21 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3650 21 : code_helper code_;
3651 21 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
3652 21 : && code_ == code);
3653 21 : }
3654 :
3655 :
3656 :
3657 : /* Function vect_is_simple_reduction
3658 :
3659 : (1) Detect a cross-iteration def-use cycle that represents a simple
3660 : reduction computation. We look for the following pattern:
3661 :
3662 : loop_header:
3663 : a1 = phi < a0, a2 >
3664 : a3 = ...
3665 : a2 = operation (a3, a1)
3666 :
3667 : or
3668 :
3669 : a3 = ...
3670 : loop_header:
3671 : a1 = phi < a0, a2 >
3672 : a2 = operation (a3, a1)
3673 :
3674 : such that:
3675 : 1. operation is commutative and associative and it is safe to
3676 : change the order of the computation
3677 : 2. no uses for a2 in the loop (a2 is used out of the loop)
3678 : 3. no uses of a1 in the loop besides the reduction operation
3679 : 4. no uses of a1 outside the loop.
3680 :
3681 : Conditions 1,4 are tested here.
3682 : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3683 :
3684 : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3685 : nested cycles.
3686 :
3687 : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3688 : reductions:
3689 :
3690 : a1 = phi < a0, a2 >
3691 : inner loop (def of a3)
3692 : a2 = phi < a3 >
3693 :
3694 : (4) Detect condition expressions, i.e.:
3695 : for (int i = 0; i < N; i++)
3696 : if (a[i] < val)
3697 : ret_val = a[i];
3698 :
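 :    A double reduction as in (3) typically arises from source like:
 :
 :      for (i = 0; i < N; i++)
 :        for (j = 0; j < M; j++)
 :          sum += a[i][j];
 :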
3699 : */
3700 :
3701 : static stmt_vec_info
3702 164119 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3703 : gphi **double_reduc)
3704 : {
3705 164119 : gphi *phi = as_a <gphi *> (phi_info->stmt);
3706 164119 : gimple *phi_use_stmt = NULL;
3707 164119 : imm_use_iterator imm_iter;
3708 164119 : use_operand_p use_p;
3709 :
3710 : /* When double_reduc is NULL we are testing the inner loop of a
3711 : double reduction. */
3712 164119 : bool inner_loop_of_double_reduc = double_reduc == NULL;
3713 164119 : if (double_reduc)
3714 163017 : *double_reduc = NULL;
3715 164119 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3716 :
3717 164119 : tree phi_name = PHI_RESULT (phi);
3718 : /* ??? If there are no uses of the PHI result the inner loop reduction
3719 : won't be detected as possibly double-reduction by vectorizable_reduction
3720 : because that tries to walk the PHI arg from the preheader edge which
3721 : can be constant. See PR60382. */
3722 164119 : if (has_zero_uses (phi_name))
3723 : return NULL;
3724 163986 : class loop *loop = (gimple_bb (phi))->loop_father;
3725 163986 : unsigned nphi_def_loop_uses = 0;
3726 619666 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3727 : {
3728 303437 : gimple *use_stmt = USE_STMT (use_p);
3729 303437 : if (is_gimple_debug (use_stmt))
3730 82504 : continue;
3731 :
3732 220933 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3733 : {
3734 11743 : if (dump_enabled_p ())
3735 40 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3736 : "intermediate value used outside loop.\n");
3737 :
3738 11743 : return NULL;
3739 : }
3740 :
3741 : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
3742 : op1 twice (once as definition, once as else) in the same operation.
3743 : Only count it as one. */
3744 209190 : if (use_stmt != phi_use_stmt)
3745 : {
3746 202656 : nphi_def_loop_uses++;
3747 202656 : phi_use_stmt = use_stmt;
3748 : }
3749 11743 : }
3750 :
3751 152243 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3752 152243 : if (TREE_CODE (latch_def) != SSA_NAME)
3753 : {
3754 1449 : if (dump_enabled_p ())
3755 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3756 : "reduction: not ssa_name: %T\n", latch_def);
3757 1449 : return NULL;
3758 : }
3759 :
3760 150794 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3761 150794 : if (!def_stmt_info
3762 150794 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3763 161 : return NULL;
3764 :
3765 150633 : bool nested_in_vect_loop
3766 150633 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3767 150633 : unsigned nlatch_def_loop_uses = 0;
3768 150633 : auto_vec<gphi *, 3> lcphis;
3769 740879 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3770 : {
3771 439613 : gimple *use_stmt = USE_STMT (use_p);
3772 439613 : if (is_gimple_debug (use_stmt))
3773 135556 : continue;
3774 304057 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3775 189784 : nlatch_def_loop_uses++;
3776 : else
3777 : /* We can have more than one loop-closed PHI. */
3778 114273 : lcphis.safe_push (as_a <gphi *> (use_stmt));
3779 150633 : }
3780 :
3781 : /* If we are vectorizing an inner reduction, it is executed in the
3782 : original order only when we are not dealing with a double
3783 : reduction.  */
3784 150633 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3785 : {
3786 2431 : if (dump_enabled_p ())
3787 434 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3788 : "detected nested cycle: ");
3789 2431 : return def_stmt_info;
3790 : }
3791 :
3792 : /* When the inner loop of a double reduction ends up with more than
3793 : one loop-closed PHI, we have failed to classify alternative such
3794 : PHIs as double reductions, leading to wrong code.  See PR103237. */
3795 149292 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
3796 : {
3797 1 : if (dump_enabled_p ())
3798 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3799 0 : "unhandled double reduction\n");
3800 1 : return NULL;
3801 : }
3802 :
3803 : /* If this isn't a nested cycle or if the nested cycle reduction value
3804 : is used outside of the inner loop we cannot handle uses of the reduction
3805 : value. */
3806 148201 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3807 : {
3808 45223 : if (dump_enabled_p ())
3809 401 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3810 : "reduction used in loop.\n");
3811 45223 : return NULL;
3812 : }
3813 :
3814 : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3815 : defined in the inner loop. */
3816 102978 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3817 : {
3818 1357 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
3819 1357 : if (gimple_phi_num_args (def_stmt) != 1
3820 1357 : || TREE_CODE (op1) != SSA_NAME)
3821 : {
3822 91 : if (dump_enabled_p ())
3823 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3824 : "unsupported phi node definition.\n");
3825 :
3826 91 : return NULL;
3827 : }
3828 :
3829 : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3830 : and the latch definition op1. */
3831 1266 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
3832 1266 : if (gimple_bb (def1)
3833 1266 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3834 1266 : && loop->inner
3835 1212 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3836 1212 : && (is_gimple_assign (def1) || is_gimple_call (def1))
3837 1203 : && is_a <gphi *> (phi_use_stmt)
3838 1191 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
3839 1191 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
3840 : loop_latch_edge (loop->inner)))
3841 2455 : && lcphis.length () == 1)
3842 : {
3843 1102 : if (dump_enabled_p ())
3844 144 : report_vect_op (MSG_NOTE, def_stmt,
3845 : "detected double reduction: ");
3846 :
3847 1102 : *double_reduc = as_a <gphi *> (phi_use_stmt);
3848 1102 : return def_stmt_info;
3849 : }
3850 :
3851 164 : return NULL;
3852 : }
3853 :
3854 : /* Look for the expression computing latch_def from then loop PHI result. */
3855 : /* Look for the expression computing latch_def from the loop PHI result.  */
3856 101621 : code_helper code;
3857 101621 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3858 : path, inner_loop_of_double_reduc))
3859 : {
3860 94532 : STMT_VINFO_REDUC_CODE (phi_info) = code;
3861 94532 : if (code == COND_EXPR && !nested_in_vect_loop)
3862 8220 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3863 :
3864 : /* Fill in STMT_VINFO_REDUC_IDX. */
3865 94532 : unsigned i;
3866 304429 : for (i = path.length () - 1; i >= 1; --i)
3867 : {
3868 115365 : gimple *stmt = USE_STMT (path[i].second);
3869 115365 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3870 115365 : gimple_match_op op;
3871 115365 : if (!gimple_extract_op (stmt, &op))
3872 0 : gcc_unreachable ();
3873 115365 : if (gassign *assign = dyn_cast<gassign *> (stmt))
3874 109157 : STMT_VINFO_REDUC_IDX (stmt_info)
3875 109157 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3876 : else
3877 : {
3878 6208 : gcall *call = as_a<gcall *> (stmt);
3879 6208 : STMT_VINFO_REDUC_IDX (stmt_info)
3880 6208 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
3881 : }
3882 : }
3883 94532 : if (dump_enabled_p ())
3884 4072 : dump_printf_loc (MSG_NOTE, vect_location,
3885 : "reduction: detected reduction\n");
3886 :
3887 94532 : return def_stmt_info;
3888 : }
3889 :
3890 7089 : if (dump_enabled_p ())
3891 86 : dump_printf_loc (MSG_NOTE, vect_location,
3892 : "reduction: unknown pattern\n");
3893 :
3894 : return NULL;
3895 252254 : }
3896 :
3897 : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3898 : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3899 : or -1 if not known. */
3900 :
3901 : static int
3902 479619 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3903 : {
3904 479619 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3905 479619 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3906 : {
3907 202727 : if (dump_enabled_p ())
3908 3579 : dump_printf_loc (MSG_NOTE, vect_location,
3909 : "cost model: epilogue peel iters set to vf/2 "
3910 : "because loop iterations are unknown.\n");
3911 202727 : return assumed_vf / 2;
3912 : }
3913 : else
3914 : {
3915 276892 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3916 276892 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
3917 276892 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3918 : /* If we need to peel for gaps, but no peeling is required, we have to
3919 : peel VF iterations. */
3920 276892 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3921 276892 : peel_iters_epilogue = assumed_vf;
3922 276892 : return peel_iters_epilogue;
3923 : }
3924 : }
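 :
 : /* Worked example: with NITERS == 100, ASSUMED_VF == 8 and
 :    PEEL_ITERS_PROLOGUE == 3 the epilogue peels
 :    (100 - 3) % 8 == 1 iteration; with unknown NITERS the estimate
 :    falls back to 8 / 2 == 4.  */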
3925 :
3926 : /* Calculate cost of peeling the scalar loop PEEL_ITERS_PROLOGUE times for
3927 : a prologue and the corresponding times for the epilogue. */
3928 : int
3929 355464 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue)
3930 : {
3931 355464 : int retval = 0;
3932 :
3933 355464 : int peel_iters_epilogue
3934 355464 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3935 :
3936 355464 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3937 : {
3938 : /* If peeled iterations are known but number of scalar loop
3939 : iterations are unknown, count a taken branch per peeled loop. */
3940 137066 : if (peel_iters_prologue > 0)
3941 83104 : retval = builtin_vectorization_cost (cond_branch_taken, NULL_TREE, 0);
3942 137066 : if (peel_iters_epilogue > 0)
3943 136958 : retval += builtin_vectorization_cost (cond_branch_taken, NULL_TREE, 0);
3944 : }
3945 :
3946 710928 : retval += ((peel_iters_prologue + peel_iters_epilogue)
3947 355464 : * loop_vinfo->scalar_costs->body_cost ());
3948 710928 : retval += (((peel_iters_prologue != 0) + (peel_iters_epilogue != 0))
3949 355464 : * loop_vinfo->scalar_costs->outside_cost ());
3950 :
3951 355464 : return retval;
3952 : }
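 :
 : /* In effect the cost returned above is
 :
 :      (peel_iters_prologue + peel_iters_epilogue) * scalar body cost
 :      + ((prologue != 0) + (epilogue != 0)) * scalar outside cost
 :      + a taken branch per peeled loop when the scalar iteration
 :        count is unknown.  */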
3953 :
3954 : /* Function vect_estimate_min_profitable_iters
3955 :
3956 : Return the number of iterations required for the vector version of the
3957 : loop to be profitable relative to the cost of the scalar version of the
3958 : loop.
3959 :
3960 : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3961 : of iterations for vectorization. -1 value means loop vectorization
3962 : is not profitable. This returned value may be used for dynamic
3963 : profitability check.
3964 :
3965 : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3966 : for static check against estimated number of iterations. */
3967 :
3968 : static void
3969 141354 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3970 : int *ret_min_profitable_niters,
3971 : int *ret_min_profitable_estimate,
3972 : unsigned *suggested_unroll_factor)
3973 : {
3974 141354 : int min_profitable_iters;
3975 141354 : int min_profitable_estimate;
3976 141354 : int peel_iters_prologue;
3977 141354 : int peel_iters_epilogue;
3978 141354 : unsigned vec_inside_cost = 0;
3979 141354 : int vec_outside_cost = 0;
3980 141354 : unsigned vec_prologue_cost = 0;
3981 141354 : unsigned vec_epilogue_cost = 0;
3982 141354 : int scalar_single_iter_cost = 0;
3983 141354 : int scalar_outside_cost = 0;
3984 141354 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3985 141354 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3986 141354 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
3987 :
3988 : /* Cost model disabled. */
3989 141354 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3990 : {
3991 16882 : if (dump_enabled_p ())
3992 10608 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3993 16882 : *ret_min_profitable_niters = 0;
3994 16882 : *ret_min_profitable_estimate = 0;
3995 16882 : return;
3996 : }
3997 :
3998 : /* Requires loop versioning tests to handle misalignment. */
3999 124472 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4000 : {
4001 : /* FIXME: Make cost depend on complexity of individual check. */
4002 18 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4003 18 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4004 18 : if (dump_enabled_p ())
4005 2 : dump_printf (MSG_NOTE,
4006 : "cost model: Adding cost of checks for loop "
4007 : "versioning to treat misalignment.\n");
4008 : }
4009 :
4010 : /* Requires loop versioning with alias checks. */
4011 124472 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4012 : {
4013 : /* FIXME: Make cost depend on complexity of individual check. */
4014 6988 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4015 6988 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4016 6988 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4017 4 : if (len)
4018 : /* Count LEN - 1 ANDs and LEN comparisons. */
4019 4 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4020 : scalar_stmt, vect_prologue);
4021 6988 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4022 1148 : if (len)
4023 : {
4024 : /* Count LEN - 1 ANDs and LEN comparisons. */
4025 1148 : unsigned int nstmts = len * 2 - 1;
4026 : /* +1 for each bias that needs adding. */
4027 2296 : for (unsigned int i = 0; i < len; ++i)
4028 1148 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4029 143 : nstmts += 1;
4030 1148 : (void) add_stmt_cost (target_cost_data, nstmts,
4031 : scalar_stmt, vect_prologue);
4032 : }
4033 6988 : if (dump_enabled_p ())
4034 32 : dump_printf (MSG_NOTE,
4035 : "cost model: Adding cost of checks for loop "
4036 : "versioning aliasing.\n");
4037 : }
4038 :
4039 : /* Requires loop versioning with niter checks. */
4040 124472 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4041 : {
4042 : /* FIXME: Make cost depend on complexity of individual check. */
4043 763 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4044 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4045 763 : if (dump_enabled_p ())
4046 1 : dump_printf (MSG_NOTE,
4047 : "cost model: Adding cost of checks for loop "
4048 : "versioning niters.\n");
4049 : }
4050 :
4051 124472 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4052 7763 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4053 : vect_prologue);
4054 :
4055 : /* Count statements in scalar loop. Using this as scalar cost for a single
4056 : iteration for now.
4057 :
4058 : TODO: Add outer loop support.
4059 :
4060 : TODO: Consider assigning different costs to different scalar
4061 : statements. */
4062 :
4063 124472 : scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4064 :
4065 : /* Add additional cost for the peeled instructions in prologue and epilogue
4066 : loop. (For fully-masked loops there will be no peeling.)
4067 :
4068 : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4069 : at compile time, we assume it's vf/2 (the worst would be vf-1).
4070 :
4071 : TODO: Build an expression that represents peel_iters for prologue and
4072 : epilogue to be used in a run-time test. */
4073 :
4074 124472 : bool prologue_need_br_taken_cost = false;
4075 124472 : bool prologue_need_br_not_taken_cost = false;
4076 :
4077 : /* Calculate peel_iters_prologue. */
4078 124472 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4079 : peel_iters_prologue = 0;
4080 124472 : else if (npeel < 0)
4081 : {
4082 291 : peel_iters_prologue = assumed_vf / 2;
4083 291 : if (dump_enabled_p ())
4084 8 : dump_printf (MSG_NOTE, "cost model: "
4085 : "prologue peel iters set to vf/2.\n");
4086 :
4087 : /* If peeled iterations are unknown, count a taken branch and a not taken
4088 : branch per peeled loop. Even if scalar loop iterations are known,
4089 : vector iterations are not known since peeled prologue iterations are
4090 : not known. Hence guards remain the same. */
4091 : prologue_need_br_taken_cost = true;
4092 : prologue_need_br_not_taken_cost = true;
4093 : }
4094 : else
4095 : {
4096 124181 : peel_iters_prologue = npeel;
4097 124181 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4098 : /* If peeled iterations are known but number of scalar loop
4099 : iterations are unknown, count a taken branch per peeled loop. */
4100 124472 : prologue_need_br_taken_cost = true;
4101 : }
4102 :
4103 124472 : bool epilogue_need_br_taken_cost = false;
4104 124472 : bool epilogue_need_br_not_taken_cost = false;
4105 :
4106 : /* Calculate peel_iters_epilogue. */
4107 124472 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4108 : /* We need to peel exactly one iteration for gaps. */
4109 26 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4110 124446 : else if (npeel < 0)
4111 : {
4112 : /* If peeling for alignment is unknown, the loop bound of the main loop
4113 : becomes unknown. */
4114 291 : peel_iters_epilogue = assumed_vf / 2;
4115 291 : if (dump_enabled_p ())
4116 8 : dump_printf (MSG_NOTE, "cost model: "
4117 : "epilogue peel iters set to vf/2 because "
4118 : "peeling for alignment is unknown.\n");
4119 :
4120 : /* See the same reason above in peel_iters_prologue calculation. */
4121 : epilogue_need_br_taken_cost = true;
4122 : epilogue_need_br_not_taken_cost = true;
4123 : }
4124 : else
4125 : {
4126 124155 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4127 124155 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4128 : /* If peeled iterations are known but number of scalar loop
4129 : iterations are unknown, count a taken branch per peeled loop. */
4130 124472 : epilogue_need_br_taken_cost = true;
4131 : }
4132 :
4133 : /* The way we accumulate peeling costs into the vector prologue/epilogue
4134 : cost is a bit awkward given we cannot reuse scalar_costs which is
4135 : already computed and also because it cannot take into account any
4136 : epilogue vectorization we'll carry out in the end. */
4137 :
4138 124472 : stmt_info_for_cost *si;
4139 124472 : int j;
4140 : /* Add costs associated with peel_iters_prologue. */
4141 124472 : if (peel_iters_prologue)
4142 1104 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4143 : {
4144 799 : (void) add_stmt_cost (target_cost_data,
4145 799 : si->count * peel_iters_prologue, si->kind,
4146 : si->stmt_info, si->node, si->vectype,
4147 : si->misalign, vect_prologue);
4148 : }
4149 :
4150 : /* Add costs associated with peel_iters_epilogue. */
4151 124472 : if (peel_iters_epilogue)
4152 387616 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4153 : {
4154 310741 : (void) add_stmt_cost (target_cost_data,
4155 310741 : si->count * peel_iters_epilogue, si->kind,
4156 : si->stmt_info, si->node, si->vectype,
4157 : si->misalign, vect_epilogue);
4158 : }
4159 :
4160 : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4161 :
4162 124472 : if (prologue_need_br_taken_cost)
4163 291 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4164 : vect_prologue);
4165 :
4166 124472 : if (prologue_need_br_not_taken_cost)
4167 291 : (void) add_stmt_cost (target_cost_data, 1,
4168 : cond_branch_not_taken, vect_prologue);
4169 :
4170 124472 : if (epilogue_need_br_taken_cost)
4171 65072 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4172 : vect_epilogue);
4173 :
4174 124472 : if (epilogue_need_br_not_taken_cost)
4175 291 : (void) add_stmt_cost (target_cost_data, 1,
4176 : cond_branch_not_taken, vect_epilogue);
4177 :
4178 : /* Take care of special costs for rgroup controls of partial vectors. */
4179 26 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4180 124498 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4181 : == vect_partial_vectors_avx512))
4182 : {
4183 : /* Calculate how many masks we need to generate. */
4184 26 : unsigned int num_masks = 0;
4185 26 : bool need_saturation = false;
4186 108 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4187 30 : if (rgm.type)
4188 : {
4189 26 : unsigned nvectors = rgm.factor;
4190 26 : num_masks += nvectors;
4191 26 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4192 26 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4193 9 : need_saturation = true;
4194 : }
4195 :
4196 : /* ??? The target isn't able to identify the costs below as
4197 : producing masks so it cannot penalize cases where we'd run
4198 : out of mask registers for example. */
4199 :
4200 : /* ??? We are also failing to account for smaller vector masks
4201 : we generate by splitting larger masks in vect_get_loop_mask. */
4202 :
4203 : /* In the worst case, we need to generate each mask in the prologue
4204 : and in the loop body. We need one splat per group and one
4205 : compare per mask.
4206 :
4207 : Sometimes the prologue mask will fold to a constant,
4208 : so the actual prologue cost might be smaller. However, it's
4209 : simpler and safer to use the worst-case cost; if this ends up
4210 : being the tie-breaker between vectorizing or not, then it's
4211 : probably better not to vectorize. */
4212 26 : (void) add_stmt_cost (target_cost_data,
4213 : num_masks
4214 26 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4215 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4216 : vect_prologue);
4217 52 : (void) add_stmt_cost (target_cost_data,
4218 : num_masks
4219 52 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4220 : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4221 :
4222 : /* When we need saturation we need it both in the prologue and
4223 : the epilogue. */
4224 26 : if (need_saturation)
4225 : {
4226 9 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4227 : NULL, NULL, NULL_TREE, 0, vect_prologue);
4228 9 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4229 : NULL, NULL, NULL_TREE, 0, vect_body);
4230 : }
4231 : }
4232 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4233 124446 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4234 : == vect_partial_vectors_while_ult))
4235 : {
4236 : /* Calculate how many masks we need to generate. */
4237 : unsigned int num_masks = 0;
4238 : rgroup_controls *rgm;
4239 : unsigned int num_vectors_m1;
4240 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4241 : num_vectors_m1, rgm)
4242 0 : if (rgm->type)
4243 0 : num_masks += num_vectors_m1 + 1;
4244 0 : gcc_assert (num_masks > 0);
4245 :
4246 : /* In the worst case, we need to generate each mask in the prologue
4247 : and in the loop body. One of the loop body mask instructions
4248 : replaces the comparison in the scalar loop, and since we don't
4249 : count the scalar comparison against the scalar body, we shouldn't
4250 : count that vector instruction against the vector body either.
4251 :
4252 : Sometimes we can use unpacks instead of generating prologue
4253 : masks and sometimes the prologue mask will fold to a constant,
4254 : so the actual prologue cost might be smaller. However, it's
4255 : simpler and safer to use the worst-case cost; if this ends up
4256 : being the tie-breaker between vectorizing or not, then it's
4257 : probably better not to vectorize. */
4258 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4259 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4260 : vect_prologue);
4261 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4262 : vector_stmt, NULL, NULL, NULL_TREE, 0,
4263 : vect_body);
4264 : }
4265 124446 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4266 : {
4267 : /* Referring to the functions vect_set_loop_condition_partial_vectors
4268 : and vect_set_loop_controls_directly, we need to generate each
4269 : length in the prologue and in the loop body if required. Although
4270 : there are some possible optimizations, we consider the worst case
4271 : here. */
4272 :
4273 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4274 0 : signed char partial_load_store_bias
4275 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4276 0 : bool need_iterate_p
4277 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4278 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4279 :
4280 : /* Calculate how many statements to be added. */
4281 0 : unsigned int prologue_stmts = 0;
4282 0 : unsigned int body_stmts = 0;
4283 :
4284 0 : rgroup_controls *rgc;
4285 0 : unsigned int num_vectors_m1;
4286 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4287 0 : if (rgc->type)
4288 : {
4289 : /* May need one SHIFT for nitems_total computation. */
4290 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4291 0 : if (nitems != 1 && !niters_known_p)
4292 0 : prologue_stmts += 1;
4293 :
4294 : /* May need one MAX and one MINUS for wrap around. */
4295 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4296 0 : prologue_stmts += 2;
4297 :
4298 : /* Need one MAX and one MINUS for each batch limit except for
4299 : the first one. */
4300 0 : prologue_stmts += num_vectors_m1 * 2;
4301 :
4302 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4303 :
4304 : /* Need to set up lengths in the prologue; only one MIN is
4305 : required for each since the start index is zero. */
4306 0 : prologue_stmts += num_vectors;
4307 :
4308 : /* If we have a non-zero partial load bias, we need one PLUS
4309 : to adjust the load length. */
4310 0 : if (partial_load_store_bias != 0)
4311 0 : body_stmts += 1;
4312 :
4313 0 : unsigned int length_update_cost = 0;
4314 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4315 : /* For the decrement IV style, each length needs only a single
4316 : SELECT_VL or MIN to calculate the number of elements to be
4317 : processed in the current iteration. */
4318 : length_update_cost = 1;
4319 : else
4320 : /* For the increment IV style, each length may need two MINs and
4321 : one MINUS to update it in the body for the next iteration. */
4322 0 : length_update_cost = 3;
4323 :
4324 0 : if (need_iterate_p)
4325 0 : body_stmts += length_update_cost * num_vectors;
4326 : }
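             : /* A worked example (illustrative): a single rgroup of two vectors
             :    (num_vectors_m1 == 1) with unknown niters, nitems > 1, no
             :    wrap-around risk, a zero load/store bias and a decrementing IV
             :    that must iterate counts 1 (SHIFT) + 2 (MAX/MINUS for the
             :    second limit) + 2 (MINs) = 5 prologue stmts and
             :    1 * 2 = 2 body stmts.  */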
4327 :
4328 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4329 : scalar_stmt, vect_prologue);
4330 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4331 : scalar_stmt, vect_body);
4332 : }
4333 :
4334 : /* FORNOW: The scalar outside cost is incremented in one of the
4335 : following ways:
4336 :
4337 : 1. The vectorizer checks for alignment and aliasing and generates
4338 : a condition that allows dynamic vectorization. A cost model
4339 : check is ANDed with the versioning condition. Hence the scalar
4340 : code path now has the added cost of the versioning check.
4341 :
4342 : if (cost > th & versioning_check)
4343 : jmp to vector code
4344 :
4345 : Hence the run-time scalar cost is incremented by a not-taken branch cost.
4346 :
4347 : 2. The vectorizer then checks if a prologue is required. If the
4348 : cost model check was not done before during versioning, it has to
4349 : be done before the prologue check.
4350 :
4351 : if (cost <= th)
4352 : prologue = scalar_iters
4353 : if (prologue == 0)
4354 : jmp to vector code
4355 : else
4356 : execute prologue
4357 : if (prologue == num_iters)
4358 : go to exit
4359 :
4360 : Hence the run-time scalar cost is incremented by a taken branch,
4361 : plus a not-taken branch, plus a taken branch cost.
4362 :
4363 : 3. The vectorizer then checks if an epilogue is required. If the
4364 : cost model check was not done before during prologue check, it
4365 : has to be done with the epilogue check.
4366 :
4367 : if (prologue == 0)
4368 : jmp to vector code
4369 : else
4370 : execute prologue
4371 : if (prologue == num_iters)
4372 : go to exit
4373 : vector code:
4374 : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4375 : jmp to epilogue
4376 :
4377 : Hence the run-time scalar cost should be incremented by 2 taken
4378 : branches.
4379 :
4380 : TODO: The back end may reorder the BBs differently and reverse
4381 : conditions/branch directions. Change the estimates below to
4382 : something more reasonable. */
4383 :
4384 : /* If the number of iterations is known and we do not do versioning, we can
4385 : decide whether to vectorize at compile time. Hence the scalar version
4386 : does not carry cost model guard costs. */
4387 58542 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4388 183014 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4389 : {
4390 : /* Cost model check occurs at versioning. */
4391 66995 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4392 7763 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4393 : else
4394 : {
4395 : /* Cost model check occurs at prologue generation. */
4396 59232 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4397 152 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4398 152 : + vect_get_stmt_cost (cond_branch_not_taken);
4399 : /* Cost model check occurs at epilogue generation. */
4400 : else
4401 59080 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4402 : }
4403 : }
4404 :
4405 : /* Complete the target-specific cost calculations. */
4406 124472 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
4407 124472 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
4408 124472 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
4409 124472 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
4410 124472 : if (suggested_unroll_factor)
4411 124104 : *suggested_unroll_factor
4412 124104 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
4413 :
4414 124104 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
4415 413 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4416 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4417 : *suggested_unroll_factor,
4418 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4419 : {
4420 0 : if (dump_enabled_p ())
4421 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4422 : "can't unroll as unrolled vectorization factor larger"
4423 : " than maximum vectorization factor: "
4424 : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4425 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4426 0 : *suggested_unroll_factor = 1;
4427 : }
4428 :
4429 124472 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4430 :
4431 124472 : if (dump_enabled_p ())
4432 : {
4433 1075 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4434 1075 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4435 : vec_inside_cost);
4436 1075 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4437 : vec_prologue_cost);
4438 1075 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4439 : vec_epilogue_cost);
4440 1075 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4441 : scalar_single_iter_cost);
4442 1075 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4443 : scalar_outside_cost);
4444 1075 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4445 : vec_outside_cost);
4446 1075 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4447 : peel_iters_prologue);
4448 1075 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4449 : peel_iters_epilogue);
4450 : }
4451 :
4452 : /* Calculate number of iterations required to make the vector version
4453 : profitable, relative to the loop bodies only. The following condition
4454 : must hold true:
4455 : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4456 : where
4457 : SIC = scalar iteration cost, VIC = vector iteration cost,
4458 : VOC = vector outside cost, VF = vectorization factor,
4459 : NPEEL = prologue iterations + epilogue iterations,
4460 : SOC = scalar outside cost for run time cost model check. */
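             : /* A worked example with illustrative numbers: for SIC = 4, VF = 4,
             :    VIC = 12, VOC = 20 and NPEEL = SOC = 0, each vector iteration
             :    saves 4 * 4 - 12 = 4 units, so the 20 units of vector outside
             :    cost are amortized after 20 / 4 + 1 = 6 vector iterations,
             :    i.e. 24 scalar iterations.  */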
4461 :
4462 124472 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4463 124472 : - vec_inside_cost);
4464 124472 : if (saving_per_viter <= 0)
4465 : {
4466 24580 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4467 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4468 : "vectorization did not happen for a simd loop");
4469 :
4470 24580 : if (dump_enabled_p ())
4471 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4472 : "cost model: the vector iteration cost = %d "
4473 : "divided by the scalar iteration cost = %d "
4474 : "is greater or equal to the vectorization factor = %d"
4475 : ".\n",
4476 : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4477 24580 : *ret_min_profitable_niters = -1;
4478 24580 : *ret_min_profitable_estimate = -1;
4479 24580 : return;
4480 : }
4481 :
4482 : /* ??? The "if" arm is written to handle all cases; see below for what
4483 : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4484 99892 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4485 : {
4486 : /* Rewriting the condition above in terms of the number of
4487 : vector iterations (vniters) rather than the number of
4488 : scalar iterations (niters) gives:
4489 :
4490 : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4491 :
4492 : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4493 :
4494 : For integer N, X and Y when X > 0:
4495 :
4496 : N * X > Y <==> N >= (Y /[floor] X) + 1. */
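             : /* E.g. for Y = 10 and X = 4: N * 4 > 10 requires
             :    N >= (10 /[floor] 4) + 1 = 3; indeed 3 * 4 = 12 > 10
             :    while 2 * 4 = 8 is not.  */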
4497 18 : int outside_overhead = (vec_outside_cost
4498 18 : - scalar_single_iter_cost * peel_iters_prologue
4499 18 : - scalar_single_iter_cost * peel_iters_epilogue
4500 : - scalar_outside_cost);
4501 : /* We're only interested in cases that require at least one
4502 : vector iteration. */
4503 18 : int min_vec_niters = 1;
4504 18 : if (outside_overhead > 0)
4505 13 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4506 :
4507 18 : if (dump_enabled_p ())
4508 7 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4509 : min_vec_niters);
4510 :
4511 18 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4512 : {
4513 : /* Now that we know the minimum number of vector iterations,
4514 : find the minimum niters for which the scalar cost is larger:
4515 :
4516 : SIC * niters > VIC * vniters + VOC - SOC
4517 :
4518 : We know that the minimum niters is no more than
4519 : vniters * VF + NPEEL, but it might be (and often is) less
4520 : than that if a partial vector iteration is cheaper than the
4521 : equivalent scalar code. */
4522 18 : int threshold = (vec_inside_cost * min_vec_niters
4523 18 : + vec_outside_cost
4524 18 : - scalar_outside_cost);
4525 18 : if (threshold <= 0)
4526 : min_profitable_iters = 1;
4527 : else
4528 18 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4529 : }
4530 : else
4531 : /* Convert the number of vector iterations into a number of
4532 : scalar iterations. */
4533 0 : min_profitable_iters = (min_vec_niters * assumed_vf
4534 0 : + peel_iters_prologue
4535 : + peel_iters_epilogue);
4536 : }
4537 : else
4538 : {
4539 99874 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4540 99874 : * assumed_vf
4541 99874 : - vec_inside_cost * peel_iters_prologue
4542 99874 : - vec_inside_cost * peel_iters_epilogue);
4543 99874 : if (min_profitable_iters <= 0)
4544 : min_profitable_iters = 0;
4545 : else
4546 : {
4547 84848 : min_profitable_iters /= saving_per_viter;
4548 :
4549 84848 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4550 84848 : <= (((int) vec_inside_cost * min_profitable_iters)
4551 84848 : + (((int) vec_outside_cost - scalar_outside_cost)
4552 : * assumed_vf)))
4553 84848 : min_profitable_iters++;
4554 : }
4555 : }
4556 :
4557 99892 : if (dump_enabled_p ())
4558 1041 : dump_printf (MSG_NOTE,
4559 : " Calculated minimum iters for profitability: %d\n",
4560 : min_profitable_iters);
4561 :
4562 99892 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4563 99874 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4564 : /* We want the vectorized loop to execute at least once. */
4565 : min_profitable_iters = assumed_vf + peel_iters_prologue;
4566 22053 : else if (min_profitable_iters < peel_iters_prologue)
4567 : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4568 : vectorized loop executes at least once. */
4569 : min_profitable_iters = peel_iters_prologue;
4570 :
4571 99892 : if (dump_enabled_p ())
4572 1041 : dump_printf_loc (MSG_NOTE, vect_location,
4573 : " Runtime profitability threshold = %d\n",
4574 : min_profitable_iters);
4575 :
4576 99892 : *ret_min_profitable_niters = min_profitable_iters;
4577 :
4578 : /* Calculate number of iterations required to make the vector version
4579 : profitable, relative to the loop bodies only.
4580 :
4581 : The non-vectorized variant costs SIC * niters and the vector variant
4582 : must win over it on the expected loop trip count. The following condition must hold true:
4583 : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4584 :
4585 99892 : if (vec_outside_cost <= 0)
4586 : min_profitable_estimate = 0;
4587 : /* ??? This "else if" arm is written to handle all cases; see below for
4588 : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4589 89361 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4590 : {
4591 : /* This is a repeat of the code above, but with + SOC rather
4592 : than - SOC. */
4593 18 : int outside_overhead = (vec_outside_cost
4594 18 : - scalar_single_iter_cost * peel_iters_prologue
4595 18 : - scalar_single_iter_cost * peel_iters_epilogue
4596 : + scalar_outside_cost);
4597 18 : int min_vec_niters = 1;
4598 18 : if (outside_overhead > 0)
4599 18 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4600 :
4601 18 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4602 : {
4603 18 : int threshold = (vec_inside_cost * min_vec_niters
4604 18 : + vec_outside_cost
4605 18 : + scalar_outside_cost);
4606 18 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4607 : }
4608 : else
4609 : min_profitable_estimate = (min_vec_niters * assumed_vf
4610 : + peel_iters_prologue
4611 : + peel_iters_epilogue);
4612 : }
4613 : else
4614 : {
4615 89343 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4616 89343 : * assumed_vf
4617 89343 : - vec_inside_cost * peel_iters_prologue
4618 89343 : - vec_inside_cost * peel_iters_epilogue)
4619 89343 : / ((scalar_single_iter_cost * assumed_vf)
4620 : - vec_inside_cost);
4621 : }
4622 99892 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4623 99892 : if (dump_enabled_p ())
4624 1041 : dump_printf_loc (MSG_NOTE, vect_location,
4625 : " Static estimate profitability threshold = %d\n",
4626 : min_profitable_estimate);
4627 :
4628 99892 : *ret_min_profitable_estimate = min_profitable_estimate;
4629 : }
4630 :
4631 : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4632 : vector elements (not bits) for a vector with NELT elements. */
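             : /* For example (illustrative): OFFSET = 2 and NELT = 8 encode the
             :    stepped pattern {2, 3, 4}, which vec_perm_indices extends to
             :    the selector {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select
             :    from the second vec_perm input.  */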
4633 : static void
4634 2292 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4635 : vec_perm_builder *sel)
4636 : {
4637 : /* The encoding is a single stepped pattern. Any wrap-around is handled
4638 : by vec_perm_indices. */
4639 2292 : sel->new_vector (nelt, 1, 3);
4640 9168 : for (unsigned int i = 0; i < 3; i++)
4641 6876 : sel->quick_push (i + offset);
4642 2292 : }
4643 :
4644 : /* Checks whether the target supports whole-vector shifts for vectors of mode
4645 : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4646 : it supports vec_perm_const with masks for all necessary shift amounts. */
4647 : static bool
4648 13633 : have_whole_vector_shift (machine_mode mode)
4649 : {
4650 13633 : if (can_implement_p (vec_shr_optab, mode))
4651 : return true;
4652 :
4653 : /* Variable-length vectors should be handled via the optab. */
4654 63 : unsigned int nelt;
4655 126 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4656 : return false;
4657 :
4658 63 : vec_perm_builder sel;
4659 63 : vec_perm_indices indices;
4660 315 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4661 : {
4662 252 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
4663 252 : indices.new_vector (sel, 2, nelt);
4664 252 : if (!can_vec_perm_const_p (mode, mode, indices, false))
4665 : return false;
4666 : }
4667 : return true;
4668 63 : }
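             : /* For example (illustrative): for an 8-element mode without
             :    vec_shr, the loop above checks permute masks for shifts by
             :    4, 2 and 1 elements, i.e. every amount a log2-style reduction
             :    epilogue may use.  */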
4669 :
4670 : /* Return true if (a) SLP_NODE is a DOT_PROD_EXPR reduction whose
4671 : multiplication operands have differing signs and (b) we intend
4672 : to emulate the operation using a series of signed DOT_PROD_EXPRs.
4673 : See vect_emulate_mixed_dot_prod for the actual sequence used. */
4674 :
4675 : static bool
4676 2456 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
4677 : {
4678 2456 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
4679 2456 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4680 2003 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4681 : return false;
4682 :
4683 821 : tree rhs1 = gimple_assign_rhs1 (assign);
4684 821 : tree rhs2 = gimple_assign_rhs2 (assign);
4685 821 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4686 : return false;
4687 :
4688 627 : return !directly_supported_p (DOT_PROD_EXPR,
4689 : SLP_TREE_VECTYPE (slp_node),
4690 209 : SLP_TREE_VECTYPE
4691 : (SLP_TREE_CHILDREN (slp_node)[0]),
4692 209 : optab_vector_mixed_sign);
4693 : }
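             : /* An illustrative case: a reduction like
             :      sum += (int) sc[i] * (int) uc[i]
             :    where sc is signed char and uc is unsigned char has mixed-sign
             :    multiply operands; if the target lacks a mixed-sign
             :    DOT_PROD_EXPR, this returns true and the operation is emulated
             :    with signed DOT_PROD_EXPRs.  */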
4694 :
4695 : /* TODO: There is a close dependency between the vect_model_*_cost and
4696 : vectorizable_* functions. Restructure to avoid maintenance issues. */
4697 :
4698 : /* Function vect_model_reduction_cost.
4699 :
4700 : Models cost for a reduction operation, including the vector ops
4701 : generated within the strip-mine loop in some cases, the initial
4702 : definition before the loop, and the epilogue code that must be generated. */
4703 :
4704 : static void
4705 71966 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
4706 : slp_tree node, internal_fn reduc_fn,
4707 : vect_reduction_type reduction_type,
4708 : int ncopies, stmt_vector_for_cost *cost_vec)
4709 : {
4710 71966 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4711 71966 : tree vectype;
4712 71966 : machine_mode mode;
4713 71966 : class loop *loop = NULL;
4714 :
4715 71966 : if (loop_vinfo)
4716 71966 : loop = LOOP_VINFO_LOOP (loop_vinfo);
4717 :
4718 : /* Condition reductions generate two reductions in the loop. */
4719 71966 : if (reduction_type == COND_REDUCTION)
4720 304 : ncopies *= 2;
4721 :
4722 71966 : vectype = SLP_TREE_VECTYPE (node);
4723 71966 : mode = TYPE_MODE (vectype);
4724 71966 : stmt_vec_info orig_stmt_info
4725 71966 : = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
4726 :
4727 71966 : gimple_match_op op;
4728 71966 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4729 0 : gcc_unreachable ();
4730 :
4731 71966 : if (reduction_type == EXTRACT_LAST_REDUCTION)
4732 : /* No extra instructions are needed in the prologue. The loop body
4733 : operations are costed in vectorizable_condition. */
4734 : inside_cost = 0;
4735 71966 : else if (reduction_type == FOLD_LEFT_REDUCTION)
4736 : {
4737 : /* No extra instructions needed in the prologue. */
4738 4291 : prologue_cost = 0;
4739 :
4740 4291 : if (reduc_fn != IFN_LAST)
4741 : /* Count one reduction-like operation per vector. */
4742 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4743 : node, 0, vect_body);
4744 : else
4745 : {
4746 : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4747 4291 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4748 4291 : inside_cost = record_stmt_cost (cost_vec, nelements,
4749 : vec_to_scalar, node, 0,
4750 : vect_body);
4751 4291 : inside_cost += record_stmt_cost (cost_vec, nelements,
4752 : scalar_stmt, node, 0,
4753 : vect_body);
4754 : }
4755 : }
4756 : else
4757 : {
4758 : /* Add in the cost of the initial definitions. */
4759 67675 : int prologue_stmts;
4760 67675 : if (reduction_type == COND_REDUCTION)
4761 : /* For cond reductions we have four vectors: initial index, step,
4762 : initial result of the data reduction, initial value of the index
4763 : reduction. */
4764 : prologue_stmts = 4;
4765 : else
4766 : /* We need the initial reduction value. */
4767 67371 : prologue_stmts = 1;
4768 67675 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4769 : scalar_to_vec, node, 0,
4770 : vect_prologue);
4771 : }
4772 :
4773 : /* Determine cost of epilogue code.
4774 :
4775 : We have a reduction operator that will reduce the vector in one statement.
4776 : Also requires scalar extract. */
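             : /* For instance (illustrative): a V4SI ADD reduction with a direct
             :    reduc_fn is charged one vector_stmt plus one vec_to_scalar
             :    below; with whole-vector shifts instead it is charged
             :    log2 (4) * 2 = 4 vector stmts plus the final extract.  */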
4777 :
4778 71966 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4779 : {
4780 71782 : if (reduc_fn != IFN_LAST)
4781 : {
4782 52310 : if (reduction_type == COND_REDUCTION)
4783 : {
4784 : /* An EQ stmt and a COND_EXPR stmt. */
4785 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4786 : vector_stmt, node, 0,
4787 : vect_epilogue);
4788 : /* Reduction of the max index and a reduction of the found
4789 : values. */
4790 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4791 : vec_to_scalar, node, 0,
4792 : vect_epilogue);
4793 : /* A broadcast of the max value. */
4794 8 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4795 : scalar_to_vec, node, 0,
4796 : vect_epilogue);
4797 : }
4798 : else
4799 : {
4800 52302 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4801 : node, 0, vect_epilogue);
4802 52302 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4803 : vec_to_scalar, node, 0,
4804 : vect_epilogue);
4805 : }
4806 : }
4807 19472 : else if (reduction_type == COND_REDUCTION)
4808 : {
4809 296 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4810 : /* Extraction of scalar elements. */
4811 592 : epilogue_cost += record_stmt_cost (cost_vec,
4812 296 : 2 * estimated_nunits,
4813 : vec_to_scalar, node, 0,
4814 : vect_epilogue);
4815 : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4816 296 : epilogue_cost += record_stmt_cost (cost_vec,
4817 296 : 2 * estimated_nunits - 3,
4818 : scalar_stmt, node, 0,
4819 : vect_epilogue);
4820 : }
4821 19176 : else if (reduction_type == EXTRACT_LAST_REDUCTION
4822 19176 : || reduction_type == FOLD_LEFT_REDUCTION)
4823 : /* No extra instructions are needed in the epilogue. */
4824 : ;
4825 : else
4826 : {
4827 14885 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4828 14885 : tree bitsize = TYPE_SIZE (op.type);
4829 14885 : int element_bitsize = tree_to_uhwi (bitsize);
4830 14885 : int nelements = vec_size_in_bits / element_bitsize;
4831 :
4832 14885 : if (op.code == COND_EXPR)
4833 31 : op.code = MAX_EXPR;
4834 :
4835 : /* We have a whole vector shift available. */
4836 3092 : if (VECTOR_MODE_P (mode)
4837 14885 : && directly_supported_p (op.code, vectype)
4838 26548 : && have_whole_vector_shift (mode))
4839 : {
4840 : /* Final reduction via vector shifts and the reduction operator.
4841 : Also requires scalar extract. */
4842 34989 : epilogue_cost += record_stmt_cost (cost_vec,
4843 23326 : exact_log2 (nelements) * 2,
4844 : vector_stmt, node, 0,
4845 : vect_epilogue);
4846 11663 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4847 : vec_to_scalar, node, 0,
4848 : vect_epilogue);
4849 : }
4850 : else
4851 : /* Use extracts and reduction op for final reduction. For N
4852 : elements, we have N extracts and N-1 reduction ops. */
4853 3222 : epilogue_cost += record_stmt_cost (cost_vec,
4854 3222 : nelements + nelements - 1,
4855 : vector_stmt, node, 0,
4856 : vect_epilogue);
4857 : }
4858 : }
4859 :
4860 71966 : if (dump_enabled_p ())
4861 2977 : dump_printf (MSG_NOTE,
4862 : "vect_model_reduction_cost: inside_cost = %d, "
4863 : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4864 : prologue_cost, epilogue_cost);
4865 71966 : }
4866 :
4867 : /* SEQ is a sequence of instructions that initialize the reduction
4868 : described by REDUC_INFO. Emit them in the appropriate place. */
4869 :
4870 : static void
4871 442 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4872 : vect_reduc_info reduc_info, gimple *seq)
4873 : {
4874 442 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
4875 : {
4876 : /* When reusing an accumulator from the main loop, we only need
4877 : initialization instructions if the main loop can be skipped.
4878 : In that case, emit the initialization instructions at the end
4879 : of the guard block that does the skip. */
4880 22 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
4881 22 : gcc_assert (skip_edge);
4882 22 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4883 22 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4884 : }
4885 : else
4886 : {
4887 : /* The normal case: emit the initialization instructions on the
4888 : preheader edge. */
4889 420 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4890 420 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4891 : }
4892 442 : }
4893 :
4894 : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4895 : which performs a reduction involving GROUP_SIZE scalar statements.
4896 : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4897 : is nonnull, introducing extra elements of that value will not change the
4898 : result. */
4899 :
4900 : static void
4901 21604 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4902 : vect_reduc_info reduc_info,
4903 : tree vector_type,
4904 : vec<tree> *vec_oprnds,
4905 : unsigned int number_of_vectors,
4906 : unsigned int group_size, tree neutral_op)
4907 : {
4908 21604 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
4909 21604 : unsigned HOST_WIDE_INT nunits;
4910 21604 : unsigned j, number_of_places_left_in_vector;
4911 21604 : unsigned int i;
4912 :
4913 43208 : gcc_assert (group_size == initial_values.length () || neutral_op);
4914 :
4915 : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4916 : created vectors. It is greater than 1 if unrolling is performed.
4917 :
4918 : For example, we have two scalar operands, s1 and s2 (e.g., group of
4919 : strided accesses of size two), while NUNITS is four (i.e., four scalars
4920 : of this type can be packed in a vector). The output vector will contain
4921 : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4922 : will be 2).
4923 :
4924 : If GROUP_SIZE > NUNITS, the scalars will be split into several
4925 : vectors containing the operands.
4926 :
4927 : For example, NUNITS is four as before, and the group size is 8
4928 : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4929 : {s5, s6, s7, s8}. */
4930 :
4931 21604 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4932 : nunits = group_size;
4933 :
4934 21604 : tree vector_elt_type = TREE_TYPE (vector_type);
4935 21604 : number_of_places_left_in_vector = nunits;
4936 21604 : bool constant_p = true;
4937 21604 : tree_vector_builder elts (vector_type, nunits, 1);
4938 21604 : elts.quick_grow (nunits);
4939 21604 : gimple_seq ctor_seq = NULL;
4940 21604 : if (neutral_op
4941 42642 : && !useless_type_conversion_p (vector_elt_type,
4942 21038 : TREE_TYPE (neutral_op)))
4943 : {
4944 222 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4945 201 : neutral_op = gimple_build (&ctor_seq, COND_EXPR,
4946 : vector_elt_type,
4947 : neutral_op,
4948 : build_all_ones_cst (vector_elt_type),
4949 : build_zero_cst (vector_elt_type));
4950 : else
4951 21 : neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
4952 : }
4953 202576 : for (j = 0; j < nunits * number_of_vectors; ++j)
4954 : {
4955 180972 : tree op;
4956 180972 : i = j % group_size;
4957 :
4958 : /* Get the def before the loop. In a reduction chain we have only
4959 : one initial value; otherwise we have as many as there are PHIs in the group. */
4960 180972 : if (i >= initial_values.length () || (j > i && neutral_op))
4961 : op = neutral_op;
4962 : else
4963 : {
4964 50734 : if (!useless_type_conversion_p (vector_elt_type,
4965 25367 : TREE_TYPE (initial_values[i])))
4966 : {
4967 237 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4968 426 : initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
4969 : vector_elt_type,
4970 213 : initial_values[i],
4971 : build_all_ones_cst
4972 : (vector_elt_type),
4973 : build_zero_cst
4974 : (vector_elt_type));
4975 : else
4976 48 : initial_values[i] = gimple_convert (&ctor_seq,
4977 : vector_elt_type,
4978 24 : initial_values[i]);
4979 : }
4980 25367 : op = initial_values[i];
4981 : }
4982 :
4983 : /* Create 'vect_ = {op0,op1,...,opn}'. */
4984 180972 : number_of_places_left_in_vector--;
4985 180972 : elts[nunits - number_of_places_left_in_vector - 1] = op;
4986 180972 : if (!CONSTANT_CLASS_P (op))
4987 2337 : constant_p = false;
4988 :
4989 180972 : if (number_of_places_left_in_vector == 0)
4990 : {
4991 23068 : tree init;
4992 46136 : if (constant_p && !neutral_op
4993 45853 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4994 23068 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4995 : /* Build the vector directly from ELTS. */
4996 23068 : init = gimple_build_vector (&ctor_seq, &elts);
4997 0 : else if (neutral_op)
4998 : {
4999 : /* Build a vector of the neutral value and shift the
5000 : other elements into place. */
5001 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5002 : neutral_op);
5003 0 : int k = nunits;
5004 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5005 : k -= 1;
5006 0 : while (k > 0)
5007 : {
5008 0 : k -= 1;
5009 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5010 0 : vector_type, init, elts[k]);
5011 : }
5012 : }
5013 : else
5014 : {
5015 : /* First time round, duplicate ELTS to fill the
5016 : required number of vectors. */
5017 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5018 : elts, number_of_vectors, *vec_oprnds);
5019 0 : break;
5020 : }
5021 23068 : vec_oprnds->quick_push (init);
5022 :
5023 23068 : number_of_places_left_in_vector = nunits;
5024 23068 : elts.new_vector (vector_type, nunits, 1);
5025 23068 : elts.quick_grow (nunits);
5026 23068 : constant_p = true;
5027 : }
5028 : }
5029 21604 : if (ctor_seq != NULL)
5030 442 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5031 21604 : }
5032 :
5033 : vect_reduc_info
5034 160814 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
5035 : {
5036 160814 : if (node->cycle_info.id == -1)
5037 : return NULL;
5038 158846 : return loop_vinfo->reduc_infos[node->cycle_info.id];
5039 : }
5040 :
5041 : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5042 : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5043 : return false. */
5044 :
5045 : static bool
5046 21243 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5047 : vect_reduc_info reduc_info, tree vectype)
5048 : {
5049 21243 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5050 21243 : if (!main_loop_vinfo)
5051 : return false;
5052 :
5053 4576 : if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5054 : return false;
5055 :
5056 : /* We are not set up to handle vector bools when they are not mapped
5057 : to vector integer data types. */
5058 4561 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5059 4633 : && GET_MODE_CLASS (TYPE_MODE (vectype)) != MODE_VECTOR_INT)
5060 : return false;
5061 :
5062 4559 : unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
5063 4559 : auto_vec<tree, 16> main_loop_results (num_phis);
5064 4559 : auto_vec<tree, 16> initial_values (num_phis);
5065 4559 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5066 : {
5067 : /* The epilogue loop can be entered either from the main loop or
5068 : from an earlier guard block. */
5069 4336 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5070 17368 : for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
5071 : {
5072 : /* Look for:
5073 :
5074 : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5075 : INITIAL_VALUE(guard block)>. */
5076 4360 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5077 :
5078 4360 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5079 4360 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5080 :
5081 4360 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5082 4360 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5083 :
5084 4360 : main_loop_results.quick_push (from_main_loop);
5085 4360 : initial_values.quick_push (from_skip);
5086 : }
5087 : }
5088 : else
5089 : /* The main loop dominates the epilogue loop. */
5090 223 : main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));
5091 :
5092 : /* See if the main loop has the kind of accumulator we need. */
5093 4559 : vect_reusable_accumulator *accumulator
5094 4559 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5095 4559 : if (!accumulator
5096 9102 : || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
5097 13657 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5098 : VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
5099 : return false;
5100 :
5101 : /* Handle the case where we can reduce wider vectors to narrower ones. */
5102 4549 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5103 4549 : unsigned HOST_WIDE_INT m;
5104 4549 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5105 4549 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5106 0 : return false;
5107 : /* Check the intermediate vector types and operations are available. */
5108 4549 : tree prev_vectype = old_vectype;
5109 4549 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5110 13273 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5111 : {
5112 4699 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5113 4699 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5114 4699 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5115 4699 : if (!intermediate_vectype
5116 4699 : || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
5117 : intermediate_vectype)
5118 8878 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5119 4179 : TYPE_MODE (intermediate_vectype)))
5120 : return false;
5121 : prev_vectype = intermediate_vectype;
5122 : }
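             : /* For example (illustrative): a V8SI accumulator from the main
             :    loop can seed a V4SI epilogue reduction if a V4SI reduction
             :    operation and a V8SI -> V4SI vec_extract exist; then m == 2
             :    and the loop above checks a single halving step.  */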
5123 :
5124 : /* Non-SLP reductions might apply an adjustment after the reduction
5125 : operation, in order to simplify the initialization of the accumulator.
5126 : If the epilogue loop carries on from where the main loop left off,
5127 : it should apply the same adjustment to the final reduction result.
5128 :
5129 : If the epilogue loop can also be entered directly (rather than via
5130 : the main loop), we need to be able to handle that case in the same way,
5131 : with the same adjustment. (In principle we could add a PHI node
5132 : to select the correct adjustment, but in practice that shouldn't be
5133 : necessary.) */
5134 4025 : tree main_adjustment
5135 4025 : = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5136 4025 : if (loop_vinfo->main_loop_edge && main_adjustment)
5137 : {
5138 3435 : gcc_assert (num_phis == 1);
5139 3435 : tree initial_value = initial_values[0];
5140 : /* Check that we can use INITIAL_VALUE as the adjustment and
5141 : initialize the accumulator with a neutral value instead. */
5142 3435 : if (!operand_equal_p (initial_value, main_adjustment))
5143 : return false;
5144 3425 : initial_values[0] = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
5145 : }
5146 4015 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5147 4015 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
5148 4015 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
5149 4015 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
5150 4015 : return true;
5151 4559 : }
5152 :
5153 : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5154 : CODE, appending the generated stmts to SEQ. Returns a vector def of VECTYPE. */
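             : /* For example (illustrative): reducing a V8SI VEC_DEF to V4SI
             :    with CODE = PLUS_EXPR extracts the low and high V4SI halves
             :    (directly or via integer punning) and adds them; wider inputs
             :    repeat the halving step.  */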
5155 :
5156 : static tree
5157 4059 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5158 : gimple_seq *seq)
5159 : {
5160 4059 : gcc_assert (!VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (vec_def))
5161 : || (GET_MODE_CLASS (TYPE_MODE (TREE_TYPE (vec_def)))
5162 : == MODE_VECTOR_INT));
5163 4059 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5164 4059 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5165 4059 : tree stype = TREE_TYPE (vectype);
5166 4059 : tree new_temp = vec_def;
5167 8261 : while (nunits > nunits1)
5168 : {
5169 4202 : nunits /= 2;
5170 4202 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5171 4202 : stype, nunits);
5172 4202 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5173 :
5174 : /* The target has to make sure we support lowpart/highpart
5175 : extraction, either via direct vector extract or through
5176 : integer mode punning. */
5177 4202 : tree dst1, dst2;
5178 4202 : gimple *epilog_stmt;
5179 4202 : if (convert_optab_handler (vec_extract_optab,
5180 4202 : TYPE_MODE (TREE_TYPE (new_temp)),
5181 4202 : TYPE_MODE (vectype1))
5182 : != CODE_FOR_nothing)
5183 : {
5184 : /* Extract sub-vectors directly once vec_extract becomes
5185 : a conversion optab. */
5186 2590 : dst1 = make_ssa_name (vectype1);
5187 2590 : epilog_stmt
5188 5180 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5189 : build3 (BIT_FIELD_REF, vectype1,
5190 2590 : new_temp, TYPE_SIZE (vectype1),
5191 : bitsize_int (0)));
5192 2590 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5193 2590 : dst2 = make_ssa_name (vectype1);
5194 2590 : epilog_stmt
5195 2590 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5196 : build3 (BIT_FIELD_REF, vectype1,
5197 2590 : new_temp, TYPE_SIZE (vectype1),
5198 2590 : bitsize_int (bitsize)));
5199 2590 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5200 : }
5201 : else
5202 : {
5203 : /* Extract via punning to an appropriately sized integer mode
5204 : vector. */
5205 1612 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5206 1612 : tree etype = build_vector_type (eltype, 2);
5207 3224 : gcc_assert (convert_optab_handler (vec_extract_optab,
5208 : TYPE_MODE (etype),
5209 : TYPE_MODE (eltype))
5210 : != CODE_FOR_nothing);
5211 1612 : tree tem = make_ssa_name (etype);
5212 1612 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5213 : build1 (VIEW_CONVERT_EXPR,
5214 : etype, new_temp));
5215 1612 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5216 1612 : new_temp = tem;
5217 1612 : tem = make_ssa_name (eltype);
5218 1612 : epilog_stmt
5219 3224 : = gimple_build_assign (tem, BIT_FIELD_REF,
5220 : build3 (BIT_FIELD_REF, eltype,
5221 1612 : new_temp, TYPE_SIZE (eltype),
5222 : bitsize_int (0)));
5223 1612 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5224 1612 : dst1 = make_ssa_name (vectype1);
5225 1612 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5226 : build1 (VIEW_CONVERT_EXPR,
5227 : vectype1, tem));
5228 1612 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5229 1612 : tem = make_ssa_name (eltype);
5230 1612 : epilog_stmt
5231 1612 : = gimple_build_assign (tem, BIT_FIELD_REF,
5232 : build3 (BIT_FIELD_REF, eltype,
5233 1612 : new_temp, TYPE_SIZE (eltype),
5234 1612 : bitsize_int (bitsize)));
5235 1612 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5236 1612 : dst2 = make_ssa_name (vectype1);
5237 1612 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5238 : build1 (VIEW_CONVERT_EXPR,
5239 : vectype1, tem));
5240 1612 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5241 : }
5242 :
5243 4202 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5244 : }
5245 4059 : if (!useless_type_conversion_p (vectype, TREE_TYPE (new_temp)))
5246 : {
5247 66 : tree dst3 = make_ssa_name (vectype);
5248 66 : gimple *epilog_stmt = gimple_build_assign (dst3, VIEW_CONVERT_EXPR,
5249 : build1 (VIEW_CONVERT_EXPR,
5250 : vectype, new_temp));
5251 66 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5252 66 : new_temp = dst3;
5253 : }
5254 :
5255 4059 : return new_temp;
5256 : }
5257 :
5258 : /* Function vect_create_epilog_for_reduction
5259 :
5260 : Create code at the loop-epilog to finalize the result of a reduction
5261 : computation.
5262 :
5263 : STMT_INFO is the scalar reduction stmt that is being vectorized.
5264 : SLP_NODE is an SLP node containing a group of reduction statements. The
5265 : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5268 : (counting from 0)
5269 : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5270 : exit this edge is always the main loop exit.
5271 :
5272 : This function:
5273 : 1. Completes the reduction def-use cycles.
5274 : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5275 : by calling the function specified by REDUC_FN if available, or by
5276 : other means (whole-vector shifts or a scalar loop).
5277 : The function also creates a new phi node at the loop exit to preserve
5278 : loop-closed form, as illustrated below.
5279 :
5280 : The flow at the entry to this function:
5281 :
5282 : loop:
5283 : vec_def = phi <vec_init, null> # REDUCTION_PHI
5284 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5285 : s_loop = scalar_stmt # (scalar) STMT_INFO
5286 : loop_exit:
5287 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5288 : use <s_out0>
5289 : use <s_out0>
5290 :
5291 : The above is transformed by this function into:
5292 :
5293 : loop:
5294 : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5295 : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5296 : s_loop = scalar_stmt # (scalar) STMT_INFO
5297 : loop_exit:
5298 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5299 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5300 : v_out2 = reduce <v_out1>
5301 : s_out3 = extract_field <v_out2, 0>
5302 : s_out4 = adjust_result <s_out3>
5303 : use <s_out4>
5304 : use <s_out4>
5305 : */
5306 :
5307 : static void
5308 21951 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5309 : stmt_vec_info stmt_info,
5310 : slp_tree slp_node,
5311 : slp_instance slp_node_instance,
5312 : edge loop_exit)
5313 : {
5314 21951 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
5315 21951 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5316 21951 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
5317 21951 : tree vectype;
5318 21951 : machine_mode mode;
5319 21951 : basic_block exit_bb;
5320 21951 : gimple *new_phi = NULL, *phi = NULL;
5321 21951 : gimple_stmt_iterator exit_gsi;
5322 21951 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5323 21951 : gimple *epilog_stmt = NULL;
5324 21951 : gimple *exit_phi;
5325 21951 : tree def;
5326 21951 : tree orig_name, scalar_result;
5327 21951 : imm_use_iterator imm_iter;
5328 21951 : use_operand_p use_p;
5329 21951 : gimple *use_stmt;
5330 21951 : auto_vec<tree> reduc_inputs;
5331 21951 : int j, i;
5332 21951 : vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
5333 21951 : unsigned int k;
5334 : /* SLP reduction without reduction chain, e.g.,
5335 : # a1 = phi <a2, a0>
5336 : # b1 = phi <b2, b0>
5337 : a2 = operation (a1)
5338 : b2 = operation (b1) */
5339 21951 : const bool slp_reduc = !reduc_info->is_reduc_chain;
5340 21951 : tree induction_index = NULL_TREE;
5341 :
5342 21951 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5343 :
5344 21951 : bool double_reduc = false;
5345 21951 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5346 21951 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5347 : {
5348 0 : double_reduc = true;
5349 0 : gcc_assert (slp_reduc);
5350 : }
5351 :
5352 21951 : vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
5353 21951 : gcc_assert (vectype);
5354 21951 : mode = TYPE_MODE (vectype);
5355 :
5356 21951 : tree induc_val = NULL_TREE;
5357 21951 : tree adjustment_def = NULL;
5358 : /* Optimize: for induction condition reduction, if we can't use zero
5359 : for induc_val, use initial_def. */
5360 21951 : if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5361 62 : induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
5362 21889 : else if (double_reduc)
5363 : ;
5364 : else
5365 21889 : adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
5366 :
5367 21951 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5368 21951 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5369 21951 : if (slp_reduc)
5370 : /* All statements produce live-out values. */
5371 43500 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5372 :
5373 21951 : unsigned vec_num
5374 21951 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5375 :
5376 : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5377 : which is updated with the current index of the loop for every match of
5378 : the original loop's cond_expr (VEC_STMT). This results in a vector
5379 : containing the last time the condition passed for that vector lane.
5380 : The first match will be a 1 to allow 0 to be used for non-matching
5381 : indexes. If there are no matches at all then the vector will be all
5382 : zeroes.
5383 :
5384 : PR92772: This algorithm is broken for architectures that support
5385 : masked vectors, but do not provide fold_extract_last. */
5386 21951 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
5387 : {
5388 67 : gcc_assert (!double_reduc);
5389 67 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5390 67 : slp_tree cond_node = slp_node_instance->root;
5391 143 : while (cond_node != slp_node_instance->reduc_phis)
5392 : {
5393 76 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5394 76 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5395 : {
5396 76 : gimple *vec_stmt
5397 76 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5398 76 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5399 76 : ccompares.safe_push
5400 76 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5401 76 : SLP_TREE_REDUC_IDX (cond_node) == 2));
5402 : }
5403 76 : int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
5404 76 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5405 : }
5406 67 : gcc_assert (ccompares.length () != 0);
5407 :
5408 67 : tree indx_before_incr, indx_after_incr;
5409 67 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5410 67 : int scalar_precision
5411 67 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5412 67 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5413 67 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
5414 67 : (TYPE_MODE (vectype), cr_index_scalar_type,
5415 : TYPE_VECTOR_SUBPARTS (vectype));
5416 :
5417 : /* First we create a simple vector induction variable which starts
5418 : with the values {1,2,3,...} (SERIES_VECT) and increments by the
5419 : vector size (STEP). */
5420 :
5421 : /* Create a {1,2,3,...} vector. */
5422 67 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5423 :
5424 : /* Create a vector of the step value. */
5425 67 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5426 67 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5427 :
5428 : /* Create an induction variable. */
5429 67 : gimple_stmt_iterator incr_gsi;
5430 67 : bool insert_after;
5431 67 : vect_iv_increment_position (LOOP_VINFO_MAIN_EXIT (loop_vinfo),
5432 : &incr_gsi, &insert_after);
5433 67 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5434 : insert_after, &indx_before_incr, &indx_after_incr);
5435 :
5436 : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5437 : filled with zeros (VEC_ZERO). */
5438 :
5439 : /* Create a vector of 0s. */
5440 67 : tree zero = build_zero_cst (cr_index_scalar_type);
5441 67 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5442 :
5443 : /* Create a vector phi node. */
5444 67 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5445 67 : new_phi = create_phi_node (new_phi_tree, loop->header);
5446 67 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5447 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
5448 :
5449 : /* Now take the condition from the loop's original cond_exprs
5450 : and produce a new cond_exprs (INDEX_COND_EXPR) which for
5451 : every match uses values from the induction variable
5452 : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5453 : (NEW_PHI_TREE).
5454 : Finally, we update the phi (NEW_PHI_TREE) to take the value of
5455 : the new cond_expr (INDEX_COND_EXPR). */
5456 67 : gimple_seq stmts = NULL;
5457 210 : for (int i = ccompares.length () - 1; i != -1; --i)
5458 : {
5459 76 : tree ccompare = ccompares[i].first;
5460 76 : if (ccompares[i].second)
5461 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5462 : cr_index_vector_type,
5463 : ccompare,
5464 : indx_before_incr, new_phi_tree);
5465 : else
5466 7 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5467 : cr_index_vector_type,
5468 : ccompare,
5469 : new_phi_tree, indx_before_incr);
5470 : }
5471 67 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5472 :
5473 : /* Update the phi with the vec cond. */
5474 67 : induction_index = new_phi_tree;
5475 67 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5476 : loop_latch_edge (loop), UNKNOWN_LOCATION);
5477 67 : }
5478 :
5479 : /* 2. Create epilog code.
5480 : The reduction epilog code operates across the elements of the vector
5481 : of partial results computed by the vectorized loop.
5482 : The reduction epilog code consists of:
5483 :
5484 : step 1: compute the scalar result in a vector (v_out2)
5485 : step 2: extract the scalar result (s_out3) from the vector (v_out2)
5486 : step 3: adjust the scalar result (s_out3) if needed.
5487 :
5488 : Step 1 can be accomplished using one of the following three schemes:
5489 : (scheme 1) using reduc_fn, if available.
5490 : (scheme 2) using whole-vector shifts, if available.
5491 : (scheme 3) using a scalar loop. In this case steps 1+2 above are
5492 : combined.
5493 :
5494 : The overall epilog code looks like this:
5495 :
5496 : s_out0 = phi <s_loop> # original EXIT_PHI
5497 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5498 : v_out2 = reduce <v_out1> # step 1
5499 : s_out3 = extract_field <v_out2, 0> # step 2
5500 : s_out4 = adjust_result <s_out3> # step 3
5501 :
5502 : (step 3 is optional, and steps 1 and 2 may be combined).
5503 : Lastly, the uses of s_out0 are replaced by s_out4. */
5504 :
5505 :
5506 : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5507 : v_out1 = phi <VECT_DEF>
5508 : Store them in NEW_PHIS. */
5509 : /* We need to reduce values in all exits. */
5510 21951 : exit_bb = loop_exit->dest;
5511 21951 : exit_gsi = gsi_after_labels (exit_bb);
5512 21951 : reduc_inputs.create (vec_num);
5513 45376 : for (unsigned i = 0; i < vec_num; i++)
5514 : {
5515 23425 : gimple_seq stmts = NULL;
5516 23425 : def = vect_get_slp_vect_def (slp_node, i);
5517 23425 : tree new_def = copy_ssa_name (def);
5518 23425 : phi = create_phi_node (new_def, exit_bb);
5519 23425 : if (LOOP_VINFO_MAIN_EXIT (loop_vinfo) == loop_exit)
5520 23398 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
5521 : else
5522 : {
5523 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
5524 30 : SET_PHI_ARG_DEF (phi, k, def);
5525 : }
5526 23425 : new_def = gimple_convert (&stmts, vectype, new_def);
5527 23425 : reduc_inputs.quick_push (new_def);
5528 23425 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5529 : }
5530 :
5531 : /* 2.2 Get the original scalar reduction variable as defined in the loop.
5532 : In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
5533 : pattern), the scalar-def is taken from the original stmt that the
5534 : pattern-stmt (STMT) replaces. */
5535 :
5536 22776 : tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5537 21951 : tree scalar_type = TREE_TYPE (scalar_dest);
5538 21951 : scalar_results.truncate (0);
5539 21951 : scalar_results.reserve_exact (group_size);
5540 21951 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5541 :
5542 : /* True if we should implement SLP_REDUC using native reduction operations
5543 : instead of scalar operations. */
5544 21951 : const bool direct_slp_reduc
5545 21951 : = (reduc_fn != IFN_LAST
5546 21951 : && slp_reduc
5547 21951 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5548 :
5549 : /* If signed overflow is undefined we might need to perform reduction
5550 : computations in an unsigned type. */
5551 21951 : tree compute_vectype = vectype;
5552 21951 : if (ANY_INTEGRAL_TYPE_P (vectype)
5553 15012 : && TYPE_OVERFLOW_UNDEFINED (vectype)
5554 5571 : && code.is_tree_code ()
5555 27522 : && arith_code_with_undefined_signed_overflow ((tree_code) code))
5556 4105 : compute_vectype = unsigned_type_for (vectype);
5557 :
5558 : /* In case of reduction chain, e.g.,
5559 : # a1 = phi <a3, a0>
5560 : a2 = operation (a1)
5561 : a3 = operation (a2),
5562 :
5563 : we may end up with more than one vector result. Here we reduce them
5564 : to one vector.
5565 :
5566 : The same is true for a SLP reduction, e.g.,
5567 : # a1 = phi <a2, a0>
5568 : # b1 = phi <b2, b0>
5569 : a2 = operation (a1)
5570 : b2 = operation (b1),
5571 :
5572 : where we can end up with more than one vector as well. We can
5573 : easily accumulate vectors when the number of vector elements is
5574 : a multiple of the SLP group size.
5575 :
5576 : The same is true if we couldn't use a single defuse cycle. */
5577 21951 : if ((!slp_reduc
5578 : || direct_slp_reduc
5579 : || (slp_reduc
5580 21951 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
5581 43902 : && reduc_inputs.length () > 1)
5582 : {
5583 544 : gimple_seq stmts = NULL;
5584 544 : tree single_input = reduc_inputs[0];
5585 544 : if (compute_vectype != vectype)
5586 159 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5587 : compute_vectype, single_input);
5588 1865 : for (k = 1; k < reduc_inputs.length (); k++)
5589 : {
5590 1321 : tree input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5591 1321 : compute_vectype, reduc_inputs[k]);
5592 1321 : single_input = gimple_build (&stmts, code, compute_vectype,
5593 : single_input, input);
5594 : }
5595 544 : if (compute_vectype != vectype)
5596 159 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5597 : vectype, single_input);
5598 544 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5599 :
5600 544 : reduc_inputs.truncate (0);
5601 544 : reduc_inputs.safe_push (single_input);
5602 : }
5603 :
5604 21951 : tree orig_reduc_input = reduc_inputs[0];
5605 :
5606 : /* If this loop is an epilogue loop that can be skipped after the
5607 : main loop, we can only share a reduction operation between the
5608 : main loop and the epilogue if we put it at the target of the
5609 : skip edge.
5610 :
5611 : We can still reuse accumulators if this check fails. Doing so has
5612 : the minor(?) benefit of making the epilogue loop's scalar result
5613 : independent of the main loop's scalar result. */
5614 21951 : bool unify_with_main_loop_p = false;
5615 21951 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
5616 4015 : && loop_vinfo->skip_this_loop_edge
5617 3775 : && single_succ_p (exit_bb)
5618 21972 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5619 : {
5620 21 : unify_with_main_loop_p = true;
5621 :
5622 21 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5623 21 : reduc_inputs[0] = make_ssa_name (vectype);
5624 21 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5625 21 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5626 : UNKNOWN_LOCATION);
5627 21 : add_phi_arg (new_phi,
5628 21 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
5629 : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5630 21 : exit_gsi = gsi_after_labels (reduc_block);
5631 : }
5632 :
5633 : /* Shouldn't be used beyond this point. */
5634 21951 : exit_bb = nullptr;
5635 :
5636 : /* If we are operating on a mask vector and do not support direct mask
5637 : reduction, work on a bool data vector instead of a mask vector. */
5638 21951 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5639 229 : && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
5640 22145 : && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
5641 : {
5642 194 : compute_vectype = vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
5643 194 : gimple_seq stmts = NULL;
5644 396 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5645 404 : reduc_inputs[i] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
5646 202 : reduc_inputs[i],
5647 : build_one_cst (vectype),
5648 : build_zero_cst (vectype));
5649 194 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5650 : }
5651 :
5652 21951 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5653 67 : && reduc_fn != IFN_LAST)
5654 : {
5655 : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5656 : various data values where the condition matched and another vector
5657 : (INDUCTION_INDEX) containing all the indexes of those matches. We
5658 : need to extract the last matching index (which will be the index with
5659 : highest value) and use this to index into the data vector.
5660 : For the case where there were no matches, the data vector will contain
5661 : all default values and the index vector will be all zeros. */
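 : /* For example, with four lanes where the condition matched at lanes 1
 :    and 3 with data values a and b, we might have
 :    REDUC_INPUTS[0] = { default, a, default, b } and
 :    INDUCTION_INDEX = { 0, 2, 0, 4 }.  The IFN_REDUC_MAX below yields 4,
 :    the comparison selects lane 3 to give { 0, 0, 0, b }, and the final
 :    reduction extracts b.  */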
5662 :
5663 : /* Get various versions of the type of the vector of indexes. */
5664 4 : tree index_vec_type = TREE_TYPE (induction_index);
5665 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5666 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
5667 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
5668 :
5669 : /* Get an unsigned integer version of the type of the data vector. */
5670 4 : int scalar_precision
5671 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5672 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5673 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5674 : vectype);
5675 :
5676 : /* First we need to create a vector (ZERO_VEC) of zeros and another
5677 : vector (MAX_INDEX_VEC) filled with the last matching index, which we
5678 : can create using a MAX reduction and then expanding.
5679 : In the case where the loop never made any matches, the max index will
5680 : be zero. */
5681 :
5682 : /* Vector of {0, 0, 0,...}. */
5683 4 : tree zero_vec = build_zero_cst (vectype);
5684 :
5685 : /* Find maximum value from the vector of found indexes. */
5686 4 : tree max_index = make_ssa_name (index_scalar_type);
5687 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5688 : 1, induction_index);
5689 4 : gimple_call_set_lhs (max_index_stmt, max_index);
5690 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5691 :
5692 : /* Vector of {max_index, max_index, max_index,...}. */
5693 4 : tree max_index_vec = make_ssa_name (index_vec_type);
5694 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5695 : max_index);
5696 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5697 : max_index_vec_rhs);
5698 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5699 :
5700 : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5701 : with the vector (INDUCTION_INDEX) of found indexes, choosing values
5702 : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5703 : otherwise. Only one value should match, resulting in a vector
5704 : (VEC_COND) with one data value and the rest zeros.
5705 : In the case where the loop never made any matches, every index will
5706 : match, resulting in a vector with all data values (which will all be
5707 : the default value). */
5708 :
5709 : /* Compare the max index vector to the vector of found indexes to find
5710 : the position of the max value. */
5711 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
5712 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5713 : induction_index,
5714 : max_index_vec);
5715 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5716 :
5717 : /* Use the compare to choose either values from the data vector or
5718 : zero. */
5719 4 : tree vec_cond = make_ssa_name (vectype);
5720 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5721 : vec_compare,
5722 4 : reduc_inputs[0],
5723 : zero_vec);
5724 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5725 :
5726 : /* Finally we need to extract the data value from the vector (VEC_COND)
5727 : into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
5728 : reduction, but because this doesn't exist, we can use a MAX reduction
5729 : instead. The data value might be signed or a float so we need to cast
5730 : it first.
5731 : In the case where the loop never made any matches, the data values are
5732 : all identical, and so will reduce down correctly. */
5733 :
5734 : /* Make the matched data values unsigned. */
5735 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5736 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5737 : vec_cond);
5738 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5739 : VIEW_CONVERT_EXPR,
5740 : vec_cond_cast_rhs);
5741 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5742 :
5743 : /* Reduce down to a scalar value. */
5744 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
5745 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5746 : 1, vec_cond_cast);
5747 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5748 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5749 :
5750 : /* Convert the reduced value back to the result type and set as the
5751 : result. */
5752 4 : gimple_seq stmts = NULL;
5753 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5754 : data_reduc);
5755 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5756 4 : scalar_results.safe_push (new_temp);
5757 4 : }
5758 21947 : else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5759 63 : && reduc_fn == IFN_LAST)
5760 : {
5761 : /* Condition reduction without supported IFN_REDUC_MAX. Generate
5762 : idx = 0;
5763 : idx_val = induction_index[0];
5764 : val = data_reduc[0];
5765 : for (idx = 0, val = init, i = 0; i < nelts; ++i)
5766 : if (induction_index[i] > idx_val)
5767 : val = data_reduc[i], idx_val = induction_index[i];
5768 : return val; */
5769 :
5770 63 : tree data_eltype = TREE_TYPE (vectype);
5771 63 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5772 63 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5773 63 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5774 : /* Enforced by vectorizable_reduction, which ensures we have target
5775 : support before allowing a conditional reduction on variable-length
5776 : vectors. */
5777 63 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5778 63 : tree idx_val = NULL_TREE, val = NULL_TREE;
5779 419 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5780 : {
5781 356 : tree old_idx_val = idx_val;
5782 356 : tree old_val = val;
5783 356 : idx_val = make_ssa_name (idx_eltype);
5784 356 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5785 : build3 (BIT_FIELD_REF, idx_eltype,
5786 : induction_index,
5787 356 : bitsize_int (el_size),
5788 356 : bitsize_int (off)));
5789 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5790 356 : val = make_ssa_name (data_eltype);
5791 712 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5792 : build3 (BIT_FIELD_REF,
5793 : data_eltype,
5794 356 : reduc_inputs[0],
5795 356 : bitsize_int (el_size),
5796 356 : bitsize_int (off)));
5797 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5798 356 : if (off != 0)
5799 : {
5800 293 : tree new_idx_val = idx_val;
5801 293 : if (off != v_size - el_size)
5802 : {
5803 230 : new_idx_val = make_ssa_name (idx_eltype);
5804 230 : epilog_stmt = gimple_build_assign (new_idx_val,
5805 : MAX_EXPR, idx_val,
5806 : old_idx_val);
5807 230 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5808 : }
5809 293 : tree cond = make_ssa_name (boolean_type_node);
5810 293 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5811 : idx_val, old_idx_val);
5812 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5813 293 : tree new_val = make_ssa_name (data_eltype);
5814 293 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5815 : cond, val, old_val);
5816 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5817 293 : idx_val = new_idx_val;
5818 293 : val = new_val;
5819 : }
5820 : }
5821 : /* Convert the reduced value back to the result type and set as the
5822 : result. */
5823 63 : gimple_seq stmts = NULL;
5824 63 : val = gimple_convert (&stmts, scalar_type, val);
5825 63 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5826 63 : scalar_results.safe_push (val);
5827 63 : }
5828 :
5829 : /* 2.3 Create the reduction code, using one of the three schemes described
5830 : above. In SLP we simply need to extract all the elements from the
5831 : vector (without reducing them), so we use scalar shifts. */
5832 21884 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
5833 : {
5834 19914 : tree tmp;
5835 19914 : tree vec_elem_type;
5836 :
5837 : /* Case 1: Create:
5838 : v_out2 = reduc_expr <v_out1> */
5839 :
5840 19914 : if (dump_enabled_p ())
5841 1512 : dump_printf_loc (MSG_NOTE, vect_location,
5842 : "Reduce using direct vector reduction.\n");
5843 :
5844 19914 : gimple_seq stmts = NULL;
5845 19914 : vec_elem_type = TREE_TYPE (vectype);
5846 19914 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5847 19914 : vec_elem_type, reduc_inputs[0]);
5848 19914 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5849 19914 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5850 :
5851 19914 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5852 62 : && induc_val)
5853 : {
5854 : /* Earlier we set the initial value to be a vector of induc_val
5855 : values.  Check the result, and if it is induc_val, replace it
5856 : with the original initial value, unless induc_val is
5857 : the same as initial_def already. */
5858 60 : tree zcompare = make_ssa_name (boolean_type_node);
5859 60 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5860 : new_temp, induc_val);
5861 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5862 60 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
5863 60 : tmp = make_ssa_name (new_scalar_dest);
5864 60 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5865 : initial_def, new_temp);
5866 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5867 60 : new_temp = tmp;
5868 : }
5869 :
5870 19914 : scalar_results.safe_push (new_temp);
5871 19914 : }
5872 1783 : else if (direct_slp_reduc)
5873 : {
5874 : /* Here we create one vector for each of the GROUP_SIZE results,
5875 : with the elements for other SLP statements replaced with the
5876 : neutral value. We can then do a normal reduction on each vector. */
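 : /* For example, with GROUP_SIZE == 2, a PLUS reduction and a four-lane
 :    input { a0, b0, a1, b1 }, the loop below builds { a0, 0, a1, 0 } and
 :    { 0, b0, 0, b1 } and reduces each vector separately, yielding the
 :    two scalar results a0+a1 and b0+b1.  */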
5877 :
5878 : /* Enforced by vectorizable_reduction. */
5879 : gcc_assert (reduc_inputs.length () == 1);
5880 : gcc_assert (pow2p_hwi (group_size));
5881 :
5882 : gimple_seq seq = NULL;
5883 :
5884 : /* Build a vector {0, 1, 2, ...}, with the same number of elements
5885 : and the same element size as VECTYPE. */
5886 : tree index = build_index_vector (vectype, 0, 1);
5887 : tree index_type = TREE_TYPE (index);
5888 : tree index_elt_type = TREE_TYPE (index_type);
5889 : tree mask_type = truth_type_for (index_type);
5890 :
5891 : /* Create a vector that, for each element, identifies which of
5892 : the results should use it. */
5893 : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5894 : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5895 : build_vector_from_val (index_type, index_mask));
5896 :
5897 : /* Get a neutral vector value. This is simply a splat of the neutral
5898 : scalar value if we have one, otherwise the initial scalar value
5899 : is itself a neutral value. */
5900 : tree vector_identity = NULL_TREE;
5901 : tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5902 : NULL_TREE, false);
5903 : if (neutral_op)
5904 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5905 : neutral_op);
5906 : for (unsigned int i = 0; i < group_size; ++i)
5907 : {
5908 : /* If there's no universal neutral value, we can use the
5909 : initial scalar value from the original PHI. This is used
5910 : for MIN and MAX reduction, for example. */
5911 : if (!neutral_op)
5912 : {
5913 : tree scalar_value
5914 : = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
5915 : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5916 : scalar_value);
5917 : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5918 : scalar_value);
5919 : }
5920 :
5921 : /* Calculate the equivalent of:
5922 :
5923 : sel[j] = (index[j] == i);
5924 :
5925 : which selects the elements of REDUC_INPUTS[0] that should
5926 : be included in the result. */
5927 : tree compare_val = build_int_cst (index_elt_type, i);
5928 : compare_val = build_vector_from_val (index_type, compare_val);
5929 : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5930 : index, compare_val);
5931 :
5932 : /* Calculate the equivalent of:
5933 :
5934 : vec = seq ? reduc_inputs[0] : vector_identity;
5935 : vec = sel ? reduc_inputs[0] : vector_identity;
5936 : VEC is now suitable for a full vector reduction. */
5937 : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5938 : sel, reduc_inputs[0], vector_identity);
5939 :
5940 : /* Do the reduction and convert it to the appropriate type. */
5941 : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5942 : TREE_TYPE (vectype), vec);
5943 : scalar = gimple_convert (&seq, scalar_type, scalar);
5944 : scalar_results.safe_push (scalar);
5945 : }
5946 : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5947 : }
5948 : else
5949 : {
5950 1783 : bool reduce_with_shift;
5951 1783 : tree vec_temp;
5952 :
5953 1783 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5954 :
5955 : /* See if the target wants to do the final (shift) reduction
5956 : in a vector mode of smaller size and first reduce upper/lower
5957 : halves against each other. */
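 : /* For example, a target may prefer to reduce a V8SI value by first
 :    adding the upper V4SI half to the lower half (done by
 :    vect_create_partial_epilog below) and only then performing the
 :    shift-based reduction in the narrower V4SI mode.  */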
5958 1970 : enum machine_mode mode1 = mode;
5959 1970 : tree stype = TREE_TYPE (vectype);
5960 1970 : if (compute_vectype != vectype)
5961 : {
5962 544 : stype = unsigned_type_for (stype);
5963 544 : gimple_seq stmts = NULL;
5964 1146 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5965 : {
5966 602 : tree new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5967 602 : compute_vectype, reduc_inputs[i]);
5968 602 : reduc_inputs[i] = new_temp;
5969 : }
5970 544 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5971 : }
5972 1970 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5973 1970 : unsigned nunits1 = nunits;
5974 1970 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5975 1970 : && reduc_inputs.length () == 1)
5976 : {
5977 41 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5978 : /* For SLP reductions we have to make sure lanes match up, but
5979 : /* For SLP reductions we have to make sure lanes match up, but
5980 : since we're doing an individual-element final reduction,
5981 : reducing the vector width here is even more important.
5982 : ??? We can also separate lanes with permutes; for the common
5983 : case of a power-of-two group size, odd/even extracts would work. */
5984 : {
5985 41 : nunits1 = least_common_multiple (nunits1, group_size);
5986 82 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5987 : }
5988 : }
5989 1929 : else if (!slp_reduc
5990 1929 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5991 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5992 :
5993 1970 : tree vectype1 = compute_vectype;
5994 1970 : if (mode1 != mode)
5995 : {
5996 47 : vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5997 47 : stype, nunits1);
5998 : /* First reduce the vector to the desired vector size we should
5999 : do shift reduction on by combining upper and lower halves. */
6000 47 : gimple_seq stmts = NULL;
6001 47 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6002 : code, &stmts);
6003 47 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6004 47 : reduc_inputs[0] = new_temp;
6005 : }
6006 :
6007 1970 : reduce_with_shift = have_whole_vector_shift (mode1);
6008 732 : if (!VECTOR_MODE_P (mode1)
6009 2700 : || !directly_supported_p (code, vectype1))
6010 : reduce_with_shift = false;
6011 :
6012 1953 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
6013 : {
6014 1730 : int element_bitsize = vector_element_bits (vectype1);
6015 : /* Enforced by vectorizable_reduction, which disallows SLP reductions
6016 : for variable-length vectors and also requires direct target support
6017 : for loop reductions. */
6018 1730 : int nelements = TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6019 1730 : vec_perm_builder sel;
6020 1730 : vec_perm_indices indices;
6021 :
6022 1730 : int elt_offset;
6023 :
6024 1730 : tree zero_vec = build_zero_cst (vectype1);
6025 : /* Case 2: Create:
6026 : for (offset = nelements/2; offset >= 1; offset/=2)
6027 : {
6028 : Create: va' = vec_shift <va, offset>
6029 : Create: va = vop <va, va'>
6030 : } */
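 : /* For example, with PLUS on a four-element vector { a, b, c, d }:
 :    shifting by two lanes and adding gives { a+c, b+d, ... }; shifting
 :    that by one lane and adding gives { a+b+c+d, ... }; the scalar
 :    result is then read from lane zero by the BIT_FIELD_REF below.  */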
6031 :
6032 1730 : if (dump_enabled_p ())
6033 366 : dump_printf_loc (MSG_NOTE, vect_location,
6034 : "Reduce using vector shifts\n");
6035 :
6036 1730 : gimple_seq stmts = NULL;
6037 1730 : new_temp = gimple_convert (&stmts, vectype1, reduc_inputs[0]);
6038 1730 : for (elt_offset = nelements / 2;
6039 3770 : elt_offset >= 1;
6040 2040 : elt_offset /= 2)
6041 : {
6042 2040 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6043 2040 : indices.new_vector (sel, 2, nelements);
6044 2040 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6045 2040 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6046 : new_temp, zero_vec, mask);
6047 2040 : new_temp = gimple_build (&stmts, code,
6048 : vectype1, new_name, new_temp);
6049 : }
6050 :
6051 : /* 2.4 Extract the final scalar result. Create:
6052 : s_out3 = extract_field <v_out2, bitpos> */
6053 :
6054 1730 : if (dump_enabled_p ())
6055 366 : dump_printf_loc (MSG_NOTE, vect_location,
6056 : "extract scalar result\n");
6057 :
6058 1730 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
6059 1730 : new_temp, bitsize_int (element_bitsize),
6060 1730 : bitsize_zero_node);
6061 1730 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6062 1730 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6063 1730 : scalar_results.safe_push (new_temp);
6064 1730 : }
6065 : else
6066 : {
6067 : /* Case 3: Create:
6068 : s = extract_field <v_out2, 0>
6069 : for (offset = element_size;
6070 : offset < vector_size;
6071 : offset += element_size;)
6072 : {
6073 : Create: s' = extract_field <v_out2, offset>
6074 : Create: s = op <s, s'> // For non SLP cases
6075 : Create: s = op <s, s'> // For non-SLP cases
6076 :
6077 240 : if (dump_enabled_p ())
6078 151 : dump_printf_loc (MSG_NOTE, vect_location,
6079 : "Reduce using scalar code.\n");
6080 :
6081 240 : tree compute_type = TREE_TYPE (vectype1);
6082 240 : unsigned element_bitsize = vector_element_bits (vectype1);
6083 240 : unsigned vec_size_in_bits = element_bitsize
6084 240 : * TYPE_VECTOR_SUBPARTS (vectype1).to_constant ();
6085 240 : tree bitsize = bitsize_int (element_bitsize);
6086 240 : gimple_seq stmts = NULL;
6087 633 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6088 : {
6089 393 : unsigned bit_offset;
6090 786 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6091 393 : vec_temp, bitsize, bitsize_zero_node);
6092 :
6093 : /* In SLP we don't need to apply reduction operation, so we just
6094 : collect s' values in SCALAR_RESULTS. */
6095 393 : if (slp_reduc)
6096 383 : scalar_results.safe_push (new_temp);
6097 :
6098 955 : for (bit_offset = element_bitsize;
6099 1348 : bit_offset < vec_size_in_bits;
6100 955 : bit_offset += element_bitsize)
6101 : {
6102 955 : tree bitpos = bitsize_int (bit_offset);
6103 955 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6104 : compute_type, vec_temp,
6105 : bitsize, bitpos);
6106 955 : if (slp_reduc)
6107 : {
6108 : /* In SLP we don't need to apply reduction operation, so
6109 : we just collect s' values in SCALAR_RESULTS. */
6110 945 : new_temp = new_name;
6111 945 : scalar_results.safe_push (new_name);
6112 : }
6113 : else
6114 10 : new_temp = gimple_build (&stmts, code, compute_type,
6115 : new_name, new_temp);
6116 : }
6117 : }
6118 :
6119 : /* The only case where we need to reduce scalar results in an SLP
6120 : reduction is unrolling.  If the size of SCALAR_RESULTS is
6121 : greater than GROUP_SIZE, we reduce them by combining elements
6122 : modulo GROUP_SIZE. */
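 : /* For example, with GROUP_SIZE == 2 and collected results
 :    { s0, s1, s2, s3 }, the loop below computes s0 op s2 and
 :    s1 op s3 and truncates SCALAR_RESULTS to those two values.  */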
6123 240 : if (slp_reduc)
6124 : {
6125 230 : tree res, first_res, new_res;
6126 :
6127 : /* Reduce multiple scalar results in case of SLP unrolling. */
6128 878 : for (j = group_size; scalar_results.iterate (j, &res);
6129 : j++)
6130 : {
6131 648 : first_res = scalar_results[j % group_size];
6132 648 : new_res = gimple_build (&stmts, code, compute_type,
6133 : first_res, res);
6134 648 : scalar_results[j % group_size] = new_res;
6135 : }
6136 230 : scalar_results.truncate (group_size);
6137 1140 : for (k = 0; k < group_size; k++)
6138 1360 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6139 680 : scalar_results[k]);
6140 : }
6141 : else
6142 : {
6143 : /* Reduction chain - we have one scalar to keep in
6144 : SCALAR_RESULTS. */
6145 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6146 10 : scalar_results.safe_push (new_temp);
6147 : }
6148 :
6149 240 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6150 : }
6151 :
6152 1970 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6153 0 : && induc_val)
6154 : {
6155 : /* Earlier we set the initial value to be a vector of induc_val
6156 : values.  Check the result, and if it is induc_val, replace it
6157 : with the original initial value, unless induc_val is
6158 : the same as initial_def already. */
6159 0 : tree zcompare = make_ssa_name (boolean_type_node);
6160 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6161 0 : scalar_results[0], induc_val);
6162 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6163 0 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
6164 0 : tree tmp = make_ssa_name (new_scalar_dest);
6165 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6166 0 : initial_def, scalar_results[0]);
6167 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6168 0 : scalar_results[0] = tmp;
6169 : }
6170 : }
6171 :
6172 : /* 2.5 Adjust the final result by the initial value of the reduction
6173 : variable. (When such adjustment is not needed, then
6174 : 'adjustment_def' is zero). For example, if code is PLUS we create:
6175 : new_temp = loop_exit_def + adjustment_def */
6176 :
6177 21951 : if (adjustment_def)
6178 : {
6179 15735 : gcc_assert (!slp_reduc || group_size == 1);
6180 15735 : gimple_seq stmts = NULL;
6181 15735 : if (double_reduc)
6182 : {
6183 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6184 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6185 0 : new_temp = gimple_build (&stmts, code, vectype,
6186 0 : reduc_inputs[0], adjustment_def);
6187 : }
6188 : else
6189 : {
6190 15735 : new_temp = scalar_results[0];
6191 15735 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6192 15735 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6193 : adjustment_def);
6194 15735 : new_temp = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6195 : new_temp);
6196 15735 : new_temp = gimple_build (&stmts, code, TREE_TYPE (compute_vectype),
6197 : new_temp, adjustment_def);
6198 15735 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6199 : }
6200 :
6201 15735 : epilog_stmt = gimple_seq_last_stmt (stmts);
6202 15735 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6203 15735 : scalar_results[0] = new_temp;
6204 : }
6205 :
6206 : /* Record this operation if it could be reused by the epilogue loop. */
6207 21951 : if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
6208 21951 : && reduc_inputs.length () == 1)
6209 21767 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6210 : { orig_reduc_input, reduc_info });
6211 :
6212 : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6213 : phis with new adjusted scalar results, i.e., replace use <s_out0>
6214 : with use <s_out4>.
6215 :
6216 : Transform:
6217 : loop_exit:
6218 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6219 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6220 : v_out2 = reduce <v_out1>
6221 : s_out3 = extract_field <v_out2, 0>
6222 : s_out4 = adjust_result <s_out3>
6223 : use <s_out0>
6224 : use <s_out0>
6225 :
6226 : into:
6227 :
6228 : loop_exit:
6229 : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6230 : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6231 : v_out2 = reduce <v_out1>
6232 : s_out3 = extract_field <v_out2, 0>
6233 : s_out4 = adjust_result <s_out3>
6234 : use <s_out4>
6235 : use <s_out4> */
6236 :
6237 43902 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6238 21951 : auto_vec<gimple *> phis;
6239 44352 : for (k = 0; k < live_out_stmts.size (); k++)
6240 : {
6241 22401 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6242 22401 : tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6243 :
6244 : /* Find the loop-closed-use at the loop exit of the original scalar
6245 : result. (The reduction result is expected to have two immediate uses,
6246 : one at the latch block, and one at the loop exit). Note with
6247 : early break we can have two exit blocks, so pick the correct PHI. */
6248 113541 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6249 68739 : if (!is_gimple_debug (USE_STMT (use_p))
6250 68739 : && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6251 : {
6252 22396 : gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
6253 22396 : if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6254 22388 : phis.safe_push (USE_STMT (use_p));
6255 22401 : }
6256 :
6257 44789 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6258 : {
6259 : /* Replace the uses: */
6260 22388 : orig_name = PHI_RESULT (exit_phi);
6261 :
6262 : /* Look for a single use at the target of the skip edge. */
6263 22388 : if (unify_with_main_loop_p)
6264 : {
6265 38 : use_operand_p use_p;
6266 38 : gimple *user;
6267 38 : if (!single_imm_use (orig_name, &use_p, &user))
6268 0 : gcc_unreachable ();
6269 38 : orig_name = gimple_get_lhs (user);
6270 : }
6271 :
6272 22388 : scalar_result = scalar_results[k];
6273 82987 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6274 : {
6275 38211 : gphi *use_phi = dyn_cast <gphi *> (use_stmt);
6276 114677 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6277 : {
6278 38233 : if (use_phi
6279 38233 : && (phi_arg_edge_from_use (use_p)->flags & EDGE_ABNORMAL))
6280 : {
6281 0 : gcc_assert (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (orig_name));
6282 0 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (scalar_result) = 1;
6283 : }
6284 38233 : SET_USE (use_p, scalar_result);
6285 : }
6286 38211 : update_stmt (use_stmt);
6287 22388 : }
6288 : }
6289 :
6290 22401 : phis.truncate (0);
6291 : }
6292 21951 : }
6293 :
6294 : /* Return a vector of type VECTYPE that is equal to the vector select
6295 : operation "MASK ? VEC : IDENTITY". Insert the select statements
6296 : before GSI. */
6297 :
6298 : static tree
6299 9 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6300 : tree vec, tree identity)
6301 : {
6302 9 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6303 9 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6304 : mask, vec, identity);
6305 9 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6306 9 : return cond;
6307 : }
6308 :
6309 : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6310 : order, starting with LHS. Insert the extraction statements before GSI and
6311 : associate the new scalar SSA names with variable SCALAR_DEST.
6312 : If MASK is nonzero, mask the input and then operate on it unconditionally.
6313 : Return the SSA name for the result. */
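 : /* For example, for a four-element VECTOR_RHS { r0, r1, r2, r3 } this
 :    emits the chain (((LHS code r0) code r1) code r2) code r3,
 :    preserving the scalar left-to-right evaluation order.  */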
6314 :
6315 : static tree
6316 1097 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6317 : tree_code code, tree lhs, tree vector_rhs,
6318 : tree mask)
6319 : {
6320 1097 : tree vectype = TREE_TYPE (vector_rhs);
6321 1097 : tree scalar_type = TREE_TYPE (vectype);
6322 1097 : tree bitsize = TYPE_SIZE (scalar_type);
6323 1097 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6324 1097 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6325 :
6326 : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6327 : to perform an unconditional element-wise reduction of it. */
6328 1097 : if (mask)
6329 : {
6330 77 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6331 : "masked_vector_rhs");
6332 77 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6333 : false);
6334 77 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6335 77 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6336 : mask, vector_rhs, vector_identity);
6337 77 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6338 77 : vector_rhs = masked_vector_rhs;
6339 : }
6340 :
6341 1097 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6342 5109 : bit_offset < vec_size_in_bits;
6343 4012 : bit_offset += element_bitsize)
6344 : {
6345 4012 : tree bitpos = bitsize_int (bit_offset);
6346 4012 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6347 : bitsize, bitpos);
6348 :
6349 4012 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6350 4012 : rhs = make_ssa_name (scalar_dest, stmt);
6351 4012 : gimple_assign_set_lhs (stmt, rhs);
6352 4012 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6353 : /* Fold the vector extract, combining it with a previous reversal
6354 : as seen in PR90579. */
6355 4012 : auto gsi2 = gsi_for_stmt (stmt);
6356 4012 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6357 356 : update_stmt (gsi_stmt (gsi2));
6358 :
6359 4012 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6360 4012 : tree new_name = make_ssa_name (scalar_dest, stmt);
6361 4012 : gimple_assign_set_lhs (stmt, new_name);
6362 4012 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6363 4012 : lhs = new_name;
6364 : }
6365 1097 : return lhs;
6366 : }
6367 :
6368 : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6369 : type of the vector input. */
6370 :
6371 : static internal_fn
6372 2956 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6373 : {
6374 2956 : internal_fn mask_reduc_fn;
6375 2956 : internal_fn mask_len_reduc_fn;
6376 :
6377 2956 : switch (reduc_fn)
6378 : {
6379 0 : case IFN_FOLD_LEFT_PLUS:
6380 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6381 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6382 0 : break;
6383 :
6384 : default:
6385 : return IFN_LAST;
6386 : }
6387 :
6388 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6389 : OPTIMIZE_FOR_SPEED))
6390 : return mask_reduc_fn;
6391 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6392 : OPTIMIZE_FOR_SPEED))
6393 : return mask_len_reduc_fn;
6394 : return IFN_LAST;
6395 : }
6396 :
6397 : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6398 : statement that sets the live-out value. REDUC_DEF_STMT is the phi
6399 : statement. CODE is the operation performed by STMT_INFO and OPS are
6400 : its scalar operands. REDUC_INDEX is the index of the operand in
6401 : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6402 : implements in-order reduction, or IFN_LAST if we should open-code it.
6403 : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6404 : that should be used to control the operation in a fully-masked loop. */
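 : /* Note the generated code keeps the scalar accumulation order: for
 :    floating-point types without reassociation, ((init + v0) + v1) can
 :    round differently from (init + (v0 + v1)), so the elements must be
 :    combined strictly in order.  */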
6405 :
6406 : static bool
6407 839 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6408 : stmt_vec_info stmt_info,
6409 : gimple_stmt_iterator *gsi,
6410 : slp_tree slp_node,
6411 : code_helper code, internal_fn reduc_fn,
6412 : int num_ops, tree vectype_in,
6413 : int reduc_index, vec_loop_masks *masks,
6414 : vec_loop_lens *lens)
6415 : {
6416 839 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6417 839 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
6418 839 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6419 :
6420 839 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6421 :
6422 839 : bool is_cond_op = false;
6423 839 : if (!code.is_tree_code ())
6424 : {
6425 23 : code = conditional_internal_fn_code (internal_fn (code));
6426 23 : gcc_assert (code != ERROR_MARK);
6427 : is_cond_op = true;
6428 : }
6429 :
6430 839 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
6431 :
6432 839 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6433 : TYPE_VECTOR_SUBPARTS (vectype_in)));
6434 :
6435 : /* ??? We should, when transforming the cycle PHI, record the existing
6436 : scalar def as vector def so looking up the vector def works. This
6437 : would also allow generalizing this for reduction paths of length > 1
6438 : and/or SLP reductions. */
6439 839 : slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
6440 839 : stmt_vec_info reduc_var_def = SLP_TREE_SCALAR_STMTS (reduc_node)[0];
6441 839 : tree reduc_var = gimple_get_lhs (STMT_VINFO_STMT (reduc_var_def));
6442 :
6443 : /* The operands either come from a binary operation or an IFN_COND operation.
6444 : The former is a gimple assign with binary rhs and the latter is a
6445 : gimple call with four arguments. */
6446 839 : gcc_assert (num_ops == 2 || num_ops == 4);
6447 :
6448 839 : auto_vec<tree> vec_oprnds0, vec_opmask;
6449 839 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
6450 839 : + (1 - reduc_index)],
6451 : &vec_oprnds0);
6452 : /* For an IFN_COND_OP we also need the vector mask operand. */
6453 839 : if (is_cond_op)
6454 23 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
6455 :
6456 : /* The transform below relies on preserving the original scalar PHI
6457 : and its latch def which we replace. So work backwards from there. */
6458 839 : tree scalar_dest
6459 839 : = gimple_phi_arg_def_from_edge (as_a <gphi *> (STMT_VINFO_STMT
6460 : (reduc_var_def)),
6461 839 : loop_latch_edge (loop));
6462 839 : stmt_vec_info scalar_dest_def_info
6463 839 : = vect_stmt_to_vectorize (loop_vinfo->lookup_def (scalar_dest));
6464 839 : tree scalar_type = TREE_TYPE (scalar_dest);
6465 :
6466 839 : int vec_num = vec_oprnds0.length ();
6467 839 : tree vec_elem_type = TREE_TYPE (vectype_out);
6468 839 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6469 :
6470 839 : tree vector_identity = NULL_TREE;
6471 839 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6472 : {
6473 2 : vector_identity = build_zero_cst (vectype_out);
6474 2 : if (!HONOR_SIGNED_ZEROS (vectype_out))
6475 : ;
6476 : else
6477 : {
6478 2 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6479 2 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6480 : vector_identity);
6481 : }
6482 : }
6483 :
6484 839 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6485 839 : int i;
6486 839 : tree def0;
6487 1936 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6488 : {
6489 1097 : gimple *new_stmt;
6490 1097 : tree mask = NULL_TREE;
6491 1097 : tree len = NULL_TREE;
6492 1097 : tree bias = NULL_TREE;
6493 1097 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6494 : {
6495 9 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
6496 : vec_num, vectype_in, i);
6497 9 : if (is_cond_op)
6498 9 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
6499 9 : loop_mask, vec_opmask[i], gsi);
6500 : else
6501 : mask = loop_mask;
6502 : }
6503 1088 : else if (is_cond_op)
6504 68 : mask = vec_opmask[i];
6505 1097 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6506 : {
6507 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6508 : i, 1, false);
6509 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6510 0 : bias = build_int_cst (intQI_type_node, biasval);
6511 0 : if (!is_cond_op)
6512 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
6513 : }
6514 :
6515 : /* Handle MINUS by adding the negative. */
6516 1097 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6517 : {
6518 0 : tree negated = make_ssa_name (vectype_out);
6519 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6520 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6521 0 : def0 = negated;
6522 : }
6523 :
6524 9 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
6525 1106 : && mask && mask_reduc_fn == IFN_LAST)
6526 9 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6527 : vector_identity);
6528 :
6529 : /* On the first iteration the input is simply the scalar phi
6530 : result, and for subsequent iterations it is the output of
6531 : the preceding operation. */
6532 1097 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6533 : {
6534 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6535 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6536 : def0, mask, len, bias);
6537 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6538 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6539 : def0, mask);
6540 : else
6541 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6542 : def0);
6543 : /* For chained SLP reductions the output of the previous reduction
6544 : operation serves as the input of the next. For the final statement
6545 : the output cannot be a temporary - we reuse the original
6546 : scalar destination of the last statement. */
6547 0 : if (i != vec_num - 1)
6548 : {
6549 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
6550 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6551 0 : gimple_set_lhs (new_stmt, reduc_var);
6552 : }
6553 : }
6554 : else
6555 : {
6556 1097 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
6557 : tree_code (code), reduc_var, def0,
6558 : mask);
6559 1097 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6560 : /* Remove the statement, so that we can use the same code paths
6561 : as for statements that we've just created. */
6562 1097 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6563 1097 : gsi_remove (&tmp_gsi, true);
6564 : }
6565 :
6566 1097 : if (i == vec_num - 1)
6567 : {
6568 839 : gimple_set_lhs (new_stmt, scalar_dest);
6569 839 : vect_finish_replace_stmt (loop_vinfo,
6570 : scalar_dest_def_info,
6571 : new_stmt);
6572 : }
6573 : else
6574 258 : vect_finish_stmt_generation (loop_vinfo,
6575 : scalar_dest_def_info,
6576 : new_stmt, gsi);
6577 :
6578 1097 : slp_node->push_vec_def (new_stmt);
6579 : }
6580 :
6581 839 : return true;
6582 839 : }
6583 :
6584 : /* Function is_nonwrapping_integer_induction.
6585 :
6586 : Check if STMT_VINFO (which is part of loop LOOP) both increments and
6587 : does not cause overflow. */
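 : /* For example, an 8-bit unsigned IV with base 0 and step 3 in a loop
 :    that executes at most 100 times can reach 300, which needs more than
 :    8 bits and would therefore wrap; the same IV in a 16-bit type would
 :    not.  */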
6588 :
6589 : static bool
6590 408 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6591 : {
6592 408 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6593 408 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6594 408 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6595 408 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6596 408 : widest_int ni, max_loop_value, lhs_max;
6597 408 : wi::overflow_type overflow = wi::OVF_NONE;
6598 :
6599 : /* Make sure the loop is integer based. */
6600 408 : if (TREE_CODE (base) != INTEGER_CST
6601 109 : || TREE_CODE (step) != INTEGER_CST)
6602 : return false;
6603 :
6604 : /* Check that the max size of the loop will not wrap. */
6605 :
6606 109 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6607 : return true;
6608 :
6609 8 : if (! max_stmt_executions (loop, &ni))
6610 : return false;
6611 :
6612 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6613 8 : &overflow);
6614 8 : if (overflow)
6615 : return false;
6616 :
6617 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6618 16 : TYPE_SIGN (lhs_type), &overflow);
6619 8 : if (overflow)
6620 : return false;
6621 :
6622 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6623 8 : <= TYPE_PRECISION (lhs_type));
6624 408 : }
6625 :
6626 : /* Check if masking can be supported by inserting a conditional expression.
6627 : CODE is the code for the operation. COND_FN is the conditional internal
6628 : function, if it exists. VECTYPE_IN is the type of the vector input. */
6629 : static bool
6630 5919 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6631 : tree vectype_in)
6632 : {
6633 5919 : if (cond_fn != IFN_LAST
6634 5919 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
6635 : OPTIMIZE_FOR_SPEED))
6636 : return false;
6637 :
6638 4177 : if (code.is_tree_code ())
6639 4171 : switch (tree_code (code))
6640 : {
6641 : case DOT_PROD_EXPR:
6642 : case SAD_EXPR:
6643 : return true;
6644 :
6645 : default:
6646 : break;
6647 : }
6648 : return false;
6649 : }
6650 :
6651 : /* Insert a conditional expression to enable masked vectorization. CODE is the
6652 : code for the operation. VOP is the array of operands. MASK is the loop
6653 : mask. GSI is a statement iterator used to place the new conditional
6654 : expression. */
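 : /* For DOT_PROD_EXPR below, one multiplication operand is rewritten to
 :    mask ? vop[1] : 0 so that inactive lanes contribute zero products;
 :    for SAD_EXPR it becomes mask ? vop[1] : vop[0], making the absolute
 :    difference of inactive lanes zero.  */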
6655 : static void
6656 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6657 : gimple_stmt_iterator *gsi)
6658 : {
6659 4 : switch (tree_code (code))
6660 : {
6661 4 : case DOT_PROD_EXPR:
6662 4 : {
6663 4 : tree vectype = TREE_TYPE (vop[1]);
6664 4 : tree zero = build_zero_cst (vectype);
6665 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6666 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6667 : mask, vop[1], zero);
6668 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6669 4 : vop[1] = masked_op1;
6670 4 : break;
6671 : }
6672 :
6673 0 : case SAD_EXPR:
6674 0 : {
6675 0 : tree vectype = TREE_TYPE (vop[1]);
6676 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6677 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6678 : mask, vop[1], vop[0]);
6679 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6680 0 : vop[1] = masked_op1;
6681 0 : break;
6682 : }
6683 :
6684 0 : default:
6685 0 : gcc_unreachable ();
6686 : }
6687 4 : }
6688 :
6689 : /* Given an operation with CODE in a loop reduction path whose reduction PHI
6690 : is specified by REDUC_INFO, TYPE is the type of its scalar result and
6691 : VECTYPE_IN is its input vectype.  The vectype of the vectorized result may
6692 : differ from VECTYPE_IN, either in base type or in number of lanes, as is
6693 : the case for lane-reducing operations.  This function checks whether, and
6694 : how, partial vectorization can be performed on the operation in the
6695 : context of LOOP_VINFO.  */
6696 :
6697 : static void
6698 4128 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
6699 : vect_reduc_info reduc_info,
6700 : slp_tree slp_node,
6701 : code_helper code, tree type,
6702 : tree vectype_in)
6703 : {
6704 4128 : enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
6705 4128 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
6706 4128 : internal_fn cond_fn
6707 1156 : = ((code.is_internal_fn ()
6708 1156 : && internal_fn_mask_index ((internal_fn)code) != -1)
6709 4128 : ? (internal_fn)code : get_conditional_internal_fn (code, type));
6710 :
6711 4128 : if (reduc_type != FOLD_LEFT_REDUCTION
6712 3338 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6713 7353 : && (cond_fn == IFN_LAST
6714 3225 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6715 : OPTIMIZE_FOR_SPEED)))
6716 : {
6717 2011 : if (dump_enabled_p ())
6718 98 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6719 : "can't operate on partial vectors because"
6720 : " no conditional operation is available.\n");
6721 2011 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6722 : }
6723 2117 : else if (reduc_type == FOLD_LEFT_REDUCTION
6724 2117 : && reduc_fn == IFN_LAST
6725 2117 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
6726 : {
6727 0 : if (dump_enabled_p ())
6728 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6729 : "can't operate on partial vectors because"
6730 : " no conditional operation is available.\n");
6731 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6732 : }
6733 2117 : else if (reduc_type == FOLD_LEFT_REDUCTION
6734 790 : && internal_fn_mask_index (reduc_fn) == -1
6735 790 : && FLOAT_TYPE_P (vectype_in)
6736 2900 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
6737 : {
6738 0 : if (dump_enabled_p ())
6739 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6740 : "can't operate on partial vectors because"
6741 : " signed zeros cannot be preserved.\n");
6742 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6743 : }
6744 : else
6745 : {
6746 2117 : internal_fn mask_reduc_fn
6747 2117 : = get_masked_reduction_fn (reduc_fn, vectype_in);
6748 2117 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6749 2117 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
6750 2117 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);
6751 :
6752 2117 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6753 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
6754 : else
6755 2117 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
6756 : }
6757 4128 : }
6758 :
6759 : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
6760 : the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
6761 : the context of LOOP_VINFO; the vector cost is recorded in COST_VEC, and
6762 : the analysis is for SLP if SLP_NODE is not NULL.
6763 : For a lane-reducing operation, the loop reduction path that it lies in
6764 : may contain normal operations, or other lane-reducing operations with
6765 : different input type sizes, for example:
6766 :
6767 : int sum = 0;
6768 : for (i)
6769 : {
6770 : ...
6771 : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
6772 : sum += w[i]; // widen-sum <vector(16) char>
6773 : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
6774 : sum += n[i]; // normal <vector(4) int>
6775 : ...
6776 : }
6777 :
6778 : The vectorization factor is essentially determined by the operation whose
6779 : input vectype has the most lanes ("vector(16) char" in the example); we
6780 : need the input vectype with the fewest lanes ("vector(4) int" in the
6781 : example) to determine the effective number of vector reduction PHIs. */
6782 :
6783 : bool
6784 380355 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
6785 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
6786 : {
6787 380355 : gimple *stmt = stmt_info->stmt;
6788 :
6789 380355 : if (!lane_reducing_stmt_p (stmt))
6790 : return false;
6791 :
6792 714 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
6793 :
6794 714 : if (!INTEGRAL_TYPE_P (type))
6795 : return false;
6796 :
6797 : /* Do not try to vectorize bit-precision reductions. */
6798 714 : if (!type_has_mode_precision_p (type))
6799 : return false;
6800 :
6801 714 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6802 :
6803 : /* TODO: Support lane-reducing operations that do not directly participate
6804 : in loop reduction. */
6805 714 : if (!reduc_info)
6806 : return false;
6807 :
6808 : /* A lane-reducing pattern inside any inner loop of LOOP_VINFO is not
6809 : recognized. */
6810 714 : gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
6811 714 : gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);
6812 :
6813 2856 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
6814 : {
6815 2142 : slp_tree slp_op;
6816 2142 : tree op;
6817 2142 : tree vectype;
6818 2142 : enum vect_def_type dt;
6819 :
6820 2142 : if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
6821 : &slp_op, &dt, &vectype))
6822 : {
6823 0 : if (dump_enabled_p ())
6824 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6825 : "use not simple.\n");
6826 0 : return false;
6827 : }
6828 :
6829 2142 : if (!vectype)
6830 : {
6831 6 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
6832 : slp_op);
6833 6 : if (!vectype)
6834 : return false;
6835 : }
6836 :
6837 2142 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
6838 : {
6839 0 : if (dump_enabled_p ())
6840 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6841 : "incompatible vector types for invariants\n");
6842 0 : return false;
6843 : }
6844 :
6845 2142 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6846 714 : continue;
6847 :
6848 : /* There should be at most one cycle def in the stmt. */
6849 1428 : if (VECTORIZABLE_CYCLE_DEF (dt))
6850 : return false;
6851 : }
6852 :
6853 714 : slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
6854 714 : tree vectype_in = SLP_TREE_VECTYPE (node_in);
6855 714 : gcc_assert (vectype_in);
6856 :
6857 : /* Compute number of effective vector statements for costing. */
6858 714 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
6859 714 : gcc_assert (ncopies_for_cost >= 1);
6860 :
6861 714 : if (vect_is_emulated_mixed_dot_prod (slp_node))
6862 : {
6863 : /* We need extra two invariants: one that contains the minimum signed
6864 : value and one that contains half of its negative. */
6865 15 : int prologue_stmts = 2;
6866 15 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
6867 : scalar_to_vec, slp_node, 0,
6868 : vect_prologue);
6869 15 : if (dump_enabled_p ())
6870 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
6871 : "extra prologue_cost = %d .\n", cost);
6872 :
6873 : /* Three dot-products and a subtraction. */
6874 15 : ncopies_for_cost *= 4;
6875 : }
6876 :
6877 714 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
6878 : 0, vect_body);
6879 :
6880 714 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
6881 : {
6882 113 : enum tree_code code = gimple_assign_rhs_code (stmt);
6883 113 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
6884 113 : node_in, code, type,
6885 : vectype_in);
6886 : }
6887 :
6888 : /* Transform via vect_transform_reduction. */
6889 714 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6890 714 : return true;
6891 : }
6892 :
6893 : /* Function vectorizable_reduction.
6894 :
6895 : Check if STMT_INFO performs a reduction operation that can be vectorized.
6896 : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6897 : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6898 : Return true if STMT_INFO is vectorizable in this way.
6899 :
6900 : This function also handles reduction idioms (patterns) that have been
6901 : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6902 : may be of this form:
6903 : X = pattern_expr (arg0, arg1, ..., X)
6904 : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6905 : sequence that had been detected and replaced by the pattern-stmt
6906 : (STMT_INFO).
6907 :
6908 : This function also handles reduction of condition expressions, for example:
6909 : for (int i = 0; i < N; i++)
6910 : if (a[i] < value)
6911 : last = a[i];
6912 : This is handled by vectorising the loop and creating an additional vector
6913 : containing the loop indexes for which "a[i] < value" was true. In the
6914 : function epilogue this is reduced to a single max value and then used to
6915 : index into the vector of results.
6916 :
6917 : In some cases of reduction patterns, the type of the reduction variable X is
6918 : different than the type of the other arguments of STMT_INFO.
6919 : In such cases, the vectype that is used when transforming STMT_INFO into
6920 : a vector stmt is different than the vectype that is used to determine the
6921 : vectorization factor, because it consists of a different number of elements
6922 : than the actual number of elements that are being operated upon in parallel.
6923 :
6924 : For example, consider an accumulation of shorts into an int accumulator.
6925 : On some targets it's possible to vectorize this pattern operating on 8
6926 : shorts at a time (hence, the vectype for purposes of determining the
6927 : vectorization factor should be V8HI); on the other hand, the vectype that
6928 : is used to create the vector form is actually V4SI (the type of the result).
6929 :
6930 : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6931 : indicates what is the actual level of parallelism (V8HI in the example), so
6932 : that the right vectorization factor would be derived. This vectype
6933 : corresponds to the type of arguments to the reduction stmt, and should *NOT*
6934 : be used to create the vectorized stmt. The right vectype for the vectorized
6935 : stmt is obtained from the type of the result X:
6936 : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6937 :
6938 : This means that, contrary to "regular" reductions (or "regular" stmts in
6939 : general), the following equation:
6940 : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6941 : does *NOT* necessarily hold for reduction patterns. */
6942 :
6943 : bool
6944 379641 : vectorizable_reduction (loop_vec_info loop_vinfo,
6945 : stmt_vec_info stmt_info, slp_tree slp_node,
6946 : slp_instance slp_node_instance,
6947 : stmt_vector_for_cost *cost_vec)
6948 : {
6949 379641 : tree vectype_in = NULL_TREE;
6950 379641 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6951 379641 : stmt_vec_info cond_stmt_vinfo = NULL;
6952 379641 : int i;
6953 379641 : int ncopies;
6954 379641 : bool single_defuse_cycle = false;
6955 379641 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6956 379641 : tree cond_reduc_val = NULL_TREE;
6957 :
6958 : /* Make sure it was already recognized as a reduction computation. */
6959 379641 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6960 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6961 379641 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6962 : return false;
6963 :
6964 : /* The reduction meta. */
6965 84188 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6966 :
6967 84188 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6968 : {
6969 1490 : gcc_assert (is_a <gphi *> (stmt_info->stmt));
6970 : /* We eventually need to set a vector type on invariant arguments. */
6971 : unsigned j;
6972 : slp_tree child;
6973 4462 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6974 2980 : if (!vect_maybe_update_slp_op_vectype (child,
6975 : SLP_TREE_VECTYPE (slp_node)))
6976 : {
6977 0 : if (dump_enabled_p ())
6978 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6979 : "incompatible vector types for "
6980 : "invariants\n");
6981 0 : return false;
6982 : }
6983 2980 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
6984 2980 : && !useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
6985 : SLP_TREE_VECTYPE (child)))
6986 : {
6987 : /* With bools we can have mask and non-mask precision vectors,
6988 : or different non-mask precisions.  While pattern recog is
6989 : supposed to guarantee consistency here, we do not have
6990 : pattern stmts for PHIs (PR123316).
6991 : Deal with that here instead of ICEing later. */
6992 8 : if (dump_enabled_p ())
6993 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6994 : "incompatible vector type setup from "
6995 : "bool pattern detection\n");
6996 8 : return false;
6997 : }
6998 : /* Analysis for double-reduction is done on the outer
6999 : loop PHI, nested cycles have no further restrictions. */
7000 1482 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7001 1482 : return true;
7002 : }
7003 :
7004 82698 : if (!is_a <gphi *> (stmt_info->stmt))
7005 : {
7006 7943 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
7007 7943 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
7008 7943 : return true;
7009 : }
7010 :
7011 74755 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7012 74755 : stmt_vec_info phi_info = stmt_info;
7013 74755 : bool double_reduc = false;
7014 74755 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7015 : {
7016 : /* We arrive here for both the inner loop LC PHI and the
7017 : outer loop PHI. The latter is what we want to analyze the
7018 : reduction with. The LC PHI is handled by vectorizable_lc_phi. */
7019 312 : if (gimple_bb (stmt_info->stmt) != loop->header)
7020 0 : return false;
7021 :
7022 : /* Set loop and phi_info to the inner loop. */
7023 312 : use_operand_p use_p;
7024 312 : gimple *use_stmt;
7025 312 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7026 : &use_p, &use_stmt);
7027 312 : gcc_assert (res);
7028 312 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
7029 312 : loop = loop->inner;
7030 312 : double_reduc = true;
7031 : }
7032 :
7033 74755 : const bool reduc_chain = reduc_info->is_reduc_chain;
7034 74755 : slp_node_instance->reduc_phis = slp_node;
7035 : /* ??? We're leaving slp_node to point to the PHIs; we only
7036 : need it to get at the number of vector stmts, which wasn't
7037 : yet initialized for the instance root. */
7038 :
7039 : /* PHIs should not participate in patterns. */
7040 74755 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7041 74755 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7042 :
7043 : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7044 : and compute the reduction chain length. Discover the real
7045 : reduction operation stmt on the way (slp_for_stmt_info). */
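: /* For illustration (this example loop is not taken from the code
:    below): for a chain like
:      sum_1 = a[i] + sum_0;
:      sum_2 = b[i] + sum_1;   <-- latch def
:    the walk from the latch def back to the PHI visits two stmts, so
:    reduc_chain_length is 2; pass-through PHIs are not counted. */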
7046 74755 : unsigned reduc_chain_length = 0;
7047 74755 : stmt_info = NULL;
7048 74755 : slp_tree slp_for_stmt_info = NULL;
7049 74755 : slp_tree vdef_slp = slp_node_instance->root;
7050 165046 : while (vdef_slp != slp_node)
7051 : {
7052 91333 : int reduc_idx = SLP_TREE_REDUC_IDX (vdef_slp);
7053 91333 : if (reduc_idx == -1)
7054 : {
7055 1034 : if (dump_enabled_p ())
7056 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7057 : "reduction chain broken by patterns.\n");
7058 1042 : return false;
7059 : }
7060 90299 : stmt_vec_info vdef = SLP_TREE_REPRESENTATIVE (vdef_slp);
7061 90299 : if (is_a <gphi *> (vdef->stmt))
7062 : {
7063 624 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7064 : /* Do not count PHIs towards the chain length. */
7065 624 : continue;
7066 : }
7067 89675 : gimple_match_op op;
7068 89675 : if (!gimple_extract_op (vdef->stmt, &op))
7069 : {
7070 0 : if (dump_enabled_p ())
7071 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7072 : "reduction chain includes unsupported"
7073 : " statement type.\n");
7074 0 : return false;
7075 : }
7076 89675 : if (CONVERT_EXPR_CODE_P (op.code))
7077 : {
7078 5234 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7079 : {
7080 8 : if (dump_enabled_p ())
7081 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7082 : "conversion in the reduction chain.\n");
7083 8 : return false;
7084 : }
7085 5226 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
7086 : }
7087 : else
7088 : {
7089 : /* First non-conversion stmt. */
7090 84441 : if (!slp_for_stmt_info)
7091 73713 : slp_for_stmt_info = vdef_slp;
7092 :
7093 84441 : if (lane_reducing_op_p (op.code))
7094 : {
7095 : /* The last operand of a lane-reducing operation is the
7096 : reduction operand. */
7097 714 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7098 :
7099 714 : slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
7100 714 : tree vectype_op = SLP_TREE_VECTYPE (op_node);
7101 714 : tree type_op = TREE_TYPE (op.ops[0]);
7102 714 : if (!vectype_op)
7103 : {
7104 9 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7105 : type_op);
7106 9 : if (!vectype_op
7107 9 : || !vect_maybe_update_slp_op_vectype (op_node,
7108 : vectype_op))
7109 0 : return false;
7110 : }
7111 :
7112 : /* To accommodate lane-reducing operations of mixed input
7113 : vectypes, choose the input vectype with the fewest lanes for the
7114 : reduction PHI statement, which would result in the most
7115 : ncopies for vectorized reduction results. */
7116 714 : if (!vectype_in
7117 714 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7118 58 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7119 685 : vectype_in = vectype_op;
7120 : }
7121 83727 : else if (!vectype_in)
7122 73028 : vectype_in = SLP_TREE_VECTYPE (slp_node);
7123 84441 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7124 : }
7125 89667 : reduc_chain_length++;
7126 : }
7127 73713 : if (!slp_for_stmt_info)
7128 : {
7129 0 : if (dump_enabled_p ())
7130 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7131 : "only noop-conversions in the reduction chain.\n");
7132 0 : return false;
7133 : }
7134 73713 : stmt_info = SLP_TREE_REPRESENTATIVE (slp_for_stmt_info);
7135 :
7136 : /* PHIs should not participate in patterns. */
7137 73713 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7138 :
7139 : /* 1. Is this a vectorizable reduction? */
7140 : /* Not supportable if the reduction variable is used in the loop, unless
7141 : it's a reduction chain. */
7142 73713 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7143 0 : && !reduc_chain)
7144 : return false;
7145 :
7146 : /* Reductions that are not used even in an enclosing outer-loop
7147 : are expected to be "live" (used outside of the loop). */
7148 73713 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7149 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7150 : return false;
7151 :
7152 : /* 2. Has this been recognized as a reduction pattern?
7153 :
7154 : Check if STMT represents a pattern that has been recognized
7155 : in earlier analysis stages. For stmts that represent a pattern,
7156 : the STMT_VINFO_RELATED_STMT field records the last stmt in
7157 : the original sequence that constitutes the pattern. */
7158 :
7159 73713 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7160 73713 : if (orig_stmt_info)
7161 : {
7162 5111 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7163 5111 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7164 : }
7165 :
7166 : /* 3. Check the operands of the operation. The first operands are defined
7167 : inside the loop body. The last operand is the reduction variable,
7168 : which is defined by the loop-header-phi. */
7169 :
7170 73713 : tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
7171 73713 : VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
7172 :
7173 73713 : gimple_match_op op;
7174 73713 : if (!gimple_extract_op (stmt_info->stmt, &op))
7175 0 : gcc_unreachable ();
7176 73713 : bool lane_reducing = lane_reducing_op_p (op.code);
7177 :
7178 73713 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7179 21980 : && !SCALAR_FLOAT_TYPE_P (op.type))
7180 : return false;
7181 :
7182 : /* Do not try to vectorize bit-precision reductions. */
7183 73713 : if (!type_has_mode_precision_p (op.type)
7184 1717 : && op.code != BIT_AND_EXPR
7185 1582 : && op.code != BIT_IOR_EXPR
7186 74189 : && op.code != BIT_XOR_EXPR)
7187 : return false;
7188 :
7189 : /* Lane-reducing ops can also never be used in a SLP reduction group
7190 : since we'll mix lanes belonging to different reductions. But it's
7191 : OK to use them in a reduction chain or when the reduction group
7192 : has just one element. */
7193 73403 : if (lane_reducing
7194 73403 : && !reduc_chain
7195 650 : && SLP_TREE_LANES (slp_node) > 1)
7196 : {
7197 0 : if (dump_enabled_p ())
7198 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7199 : "lane-reducing reduction in reduction group.\n");
7200 0 : return false;
7201 : }
7202 :
7203 : /* All uses but the last are expected to be defined in the loop.
7204 : The last use is the reduction variable. In case of a nested cycle this
7205 : assumption is not true: we use reduc_index to record the index of the
7206 : reduction variable. */
7207 73403 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7208 73403 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7209 73403 : gcc_assert (op.code != COND_EXPR || !COMPARISON_CLASS_P (op.ops[0]));
7210 235269 : for (i = 0; i < (int) op.num_ops; i++)
7211 : {
7212 : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7213 161866 : if (i == 0 && op.code == COND_EXPR)
7214 81005 : continue;
7215 :
7216 161032 : stmt_vec_info def_stmt_info;
7217 161032 : enum vect_def_type dt;
7218 161032 : if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
7219 : i, &op.ops[i], &slp_op[i], &dt,
7220 161032 : &vectype_op[i], &def_stmt_info))
7221 : {
7222 0 : if (dump_enabled_p ())
7223 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7224 : "use not simple.\n");
7225 0 : return false;
7226 : }
7227 :
7228 : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7229 : reduction operand twice (once as definition, once as else). */
7230 161032 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]
7231 322064 : == SLP_TREE_CHILDREN
7232 161032 : (slp_for_stmt_info)[SLP_TREE_REDUC_IDX (slp_for_stmt_info)])
7233 80171 : continue;
7234 :
7235 : /* There should be only one cycle def in the stmt, the one
7236 : leading to reduc_def. */
7237 80861 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]->cycle_info.id != -1)
7238 : return false;
7239 :
7240 80861 : if (!vectype_op[i])
7241 7382 : vectype_op[i]
7242 7382 : = get_vectype_for_scalar_type (loop_vinfo,
7243 7382 : TREE_TYPE (op.ops[i]), slp_op[i]);
7244 :
7245 : /* Record how the non-reduction-def value of COND_EXPR is defined.
7246 : ??? For a chain of multiple CONDs we'd have to match them all up. */
7247 80861 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7248 : {
7249 811 : if (dt == vect_constant_def)
7250 : {
7251 98 : cond_reduc_dt = dt;
7252 98 : cond_reduc_val = op.ops[i];
7253 : }
7254 713 : else if (dt == vect_induction_def
7255 408 : && def_stmt_info
7256 1121 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7257 : {
7258 109 : cond_reduc_dt = dt;
7259 109 : cond_stmt_vinfo = def_stmt_info;
7260 : }
7261 : }
7262 : }
7263 :
7264 73403 : enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7265 : /* If we have a condition reduction, see if we can simplify it further. */
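: /* Illustrative example (the loop is not taken from the code below):
:    a condition reduction has the form
:      for (i = 0; i < N; i++)
:        if (a[i] < K)
:          last = b[i];
:    i.e. the scalar result is the value from the last iteration in
:    which the condition held. */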
7266 73403 : if (reduction_type == COND_REDUCTION)
7267 : {
7268 822 : if (SLP_TREE_LANES (slp_node) != 1)
7269 : return false;
7270 :
7271 : /* When the condition itself uses the reduction value, fail. */
7272 798 : if (SLP_TREE_REDUC_IDX (slp_node) == 0)
7273 : {
7274 0 : if (dump_enabled_p ())
7275 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7276 : "condition depends on previous iteration\n");
7277 0 : return false;
7278 : }
7279 :
7280 798 : if (reduc_chain_length == 1
7281 798 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7282 : OPTIMIZE_FOR_SPEED)
7283 775 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7284 : vectype_in,
7285 : OPTIMIZE_FOR_SPEED)))
7286 : {
7287 0 : if (dump_enabled_p ())
7288 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7289 : "optimizing condition reduction with"
7290 : " FOLD_EXTRACT_LAST.\n");
7291 0 : VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7292 : }
7293 798 : else if (cond_reduc_dt == vect_induction_def)
7294 : {
7295 109 : tree base
7296 : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7297 109 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7298 :
7299 109 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7300 : && TREE_CODE (step) == INTEGER_CST);
7301 109 : cond_reduc_val = NULL_TREE;
7302 109 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7303 109 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7304 109 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7305 : ;
7306 : /* Find a suitable value: below the base for MAX_EXPR, above the
7307 : base for MIN_EXPR; for now, punt if the base is the minimum value
7308 : of the type for MAX_EXPR or the maximum value for MIN_EXPR. */
7309 97 : else if (tree_int_cst_sgn (step) == -1)
7310 : {
7311 18 : cond_reduc_op_code = MIN_EXPR;
7312 18 : if (tree_int_cst_sgn (base) == -1)
7313 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7314 18 : else if (tree_int_cst_lt (base,
7315 18 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7316 18 : cond_reduc_val
7317 18 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7318 : }
7319 : else
7320 : {
7321 79 : cond_reduc_op_code = MAX_EXPR;
7322 79 : if (tree_int_cst_sgn (base) == 1)
7323 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7324 79 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7325 : base))
7326 79 : cond_reduc_val
7327 79 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7328 : }
7329 97 : if (cond_reduc_val)
7330 : {
7331 97 : if (dump_enabled_p ())
7332 61 : dump_printf_loc (MSG_NOTE, vect_location,
7333 : "condition expression based on "
7334 : "integer induction.\n");
7335 97 : VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
7336 97 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
7337 97 : = cond_reduc_val;
7338 97 : VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7339 : }
7340 : }
7341 689 : else if (cond_reduc_dt == vect_constant_def)
7342 : {
7343 88 : enum vect_def_type cond_initial_dt;
7344 88 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7345 88 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7346 88 : if (cond_initial_dt == vect_constant_def
7347 113 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7348 25 : TREE_TYPE (cond_reduc_val)))
7349 : {
7350 25 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7351 : cond_initial_val, cond_reduc_val);
7352 25 : if (e && (integer_onep (e) || integer_zerop (e)))
7353 : {
7354 25 : if (dump_enabled_p ())
7355 16 : dump_printf_loc (MSG_NOTE, vect_location,
7356 : "condition expression based on "
7357 : "compile time constant.\n");
7358 : /* Record reduction code at analysis stage. */
7359 25 : VECT_REDUC_INFO_CODE (reduc_info)
7360 25 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7361 25 : VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
7362 : }
7363 : }
7364 : }
7365 : }
7366 :
7367 73379 : if (STMT_VINFO_LIVE_P (phi_info))
7368 : return false;
7369 :
7370 73379 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
7371 :
7372 73379 : gcc_assert (ncopies >= 1);
7373 :
7374 73379 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7375 :
7376 : /* 4.2. Check support for the epilog operation.
7377 :
7378 : If STMT represents a reduction pattern, then the type of the
7379 : reduction variable may be different from the type of the rest
7380 : of the arguments. For example, consider the case of accumulation
7381 : of shorts into an int accumulator; The original code:
7382 : S1: int_a = (int) short_a;
7383 : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7384 :
7385 : was replaced with:
7386 : STMT: int_acc = widen_sum <short_a, int_acc>
7387 :
7388 : This means that:
7389 : 1. The tree-code that is used to create the vector operation in the
7390 : epilog code (that reduces the partial results) is not the
7391 : tree-code of STMT, but is rather the tree-code of the original
7392 : stmt from the pattern that STMT is replacing. I.e., in the example
7393 : above we want to use 'widen_sum' in the loop, but 'plus' in the
7394 : epilog.
7395 : 2. The type (mode) we use to check available target support
7396 : for the vector operation to be created in the *epilog*, is
7397 : determined by the type of the reduction variable (in the example
7398 : above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7399 : However the type (mode) we use to check available target support
7400 : for the vector operation to be created *inside the loop*, is
7401 : determined by the type of the other arguments to STMT (in the
7402 : example we'd check this: optab_handler (widen_sum_optab,
7403 : vect_short_mode)).
7404 :
7405 : This is contrary to "regular" reductions, in which the types of all
7406 : the arguments are the same as the type of the reduction variable.
7407 : For "regular" reductions we can therefore use the same vector type
7408 : (and also the same tree-code) when generating the epilog code and
7409 : when generating the code inside the loop. */
7410 :
7411 73379 : code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
7412 :
7413 : /* If-conversion might have created a conditional operation like
7414 : IFN_COND_ADD already. Use the internal code for the following checks. */
7415 73379 : if (orig_code.is_internal_fn ())
7416 : {
7417 6836 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7418 6836 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7419 : }
7420 :
7421 73379 : VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
7422 :
7423 73379 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7424 73379 : if (reduction_type == TREE_CODE_REDUCTION)
7425 : {
7426 : /* Check whether it's ok to change the order of the computation.
7427 : Generally, when vectorizing a reduction we change the order of the
7428 : computation. This may change the behavior of the program in some
7429 : cases, so we need to check that this is ok. One exception is when
7430 : vectorizing an outer-loop: the inner-loop is executed sequentially,
7431 : and therefore vectorizing reductions in the inner-loop during
7432 : outer-loop vectorization is safe. Likewise when we are vectorizing
7433 : a series of reductions using SLP and the VF is one, the reductions
7434 : are performed in scalar order. */
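: /* As a sketch of the reassociation (assuming VF == 4 and a plain sum
:    reduction): the vector loop effectively computes four partial
:    accumulators
:      s0 += a[4*i+0]; s1 += a[4*i+1]; s2 += a[4*i+2]; s3 += a[4*i+3];
:    which are combined as s0 + s1 + s2 + s3 after the loop; for
:    floating-point types this can change the rounded result. */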
7435 72581 : if (!reduc_chain
7436 72581 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7437 : ;
7438 72424 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7439 : {
7440 : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7441 : is not directly used in stmt. */
7442 5168 : if (reduc_chain_length != 1)
7443 : {
7444 67 : if (dump_enabled_p ())
7445 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7446 : "in-order reduction chain without SLP.\n");
7447 67 : return false;
7448 : }
7449 : /* Code generation doesn't support function calls other
7450 : than .COND_*. */
7451 5101 : if (!op.code.is_tree_code ()
7452 5307 : && !(op.code.is_internal_fn ()
7453 103 : && conditional_internal_fn_code (internal_fn (op.code))
7454 : != ERROR_MARK))
7455 : {
7456 18 : if (dump_enabled_p ())
7457 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7458 : "in-order reduction chain operation not "
7459 : "supported.\n");
7460 18 : return false;
7461 : }
7462 5083 : VECT_REDUC_INFO_TYPE (reduc_info)
7463 5083 : = reduction_type = FOLD_LEFT_REDUCTION;
7464 : }
7465 67256 : else if (!commutative_binary_op_p (orig_code, op.type)
7466 67256 : || !associative_binary_op_p (orig_code, op.type))
7467 : {
7468 144 : if (dump_enabled_p ())
7469 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7470 : "reduction: not commutative/associative\n");
7471 144 : return false;
7472 : }
7473 : }
7474 :
7475 5083 : if ((reduction_type == COND_REDUCTION
7476 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7477 : || reduction_type == CONST_COND_REDUCTION
7478 68067 : || reduction_type == EXTRACT_LAST_REDUCTION)
7479 798 : && ncopies > 1)
7480 : {
7481 276 : if (dump_enabled_p ())
7482 60 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7483 : "multiple types in condition reduction.\n");
7484 276 : return false;
7485 : }
7486 :
7487 : /* See if we can convert a mask vector to a corresponding bool data vector
7488 : to perform the epilogue reduction. */
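: /* Rough sketch (the concrete types are only an assumption): a mask
:    like vector(4) <signed-boolean:1> may be rewritten as a data vector
:    of the corresponding boolean elements via a VEC_COND_EXPR, and the
:    epilogue reduction is then performed on that data vector. */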
7489 72874 : tree alt_vectype_out = NULL_TREE;
7490 72874 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
7491 : {
7492 1121 : alt_vectype_out
7493 2242 : = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
7494 1121 : TREE_TYPE (vectype_out),
7495 : TYPE_VECTOR_SUBPARTS
7496 : (vectype_out));
7497 1121 : if (!alt_vectype_out
7498 1121 : || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
7499 2220 : TYPE_VECTOR_SUBPARTS (vectype_out))
7500 2242 : || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
7501 22 : alt_vectype_out = NULL_TREE;
7502 : }
7503 :
7504 72874 : internal_fn reduc_fn = IFN_LAST;
7505 72874 : if (reduction_type == TREE_CODE_REDUCTION
7506 72874 : || reduction_type == FOLD_LEFT_REDUCTION
7507 : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7508 522 : || reduction_type == CONST_COND_REDUCTION)
7509 : {
7510 67383 : if (reduction_type == FOLD_LEFT_REDUCTION
7511 76763 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7512 67383 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7513 : {
7514 71794 : internal_fn sbool_fn = IFN_LAST;
7515 71794 : if (reduc_fn == IFN_LAST)
7516 : ;
7517 69813 : else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
7518 1121 : || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
7519 : == MODE_VECTOR_BOOL))
7520 138505 : && direct_internal_fn_supported_p (reduc_fn, vectype_out,
7521 : OPTIMIZE_FOR_SPEED))
7522 : ;
7523 18306 : else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
7524 1121 : && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
7525 19427 : && direct_internal_fn_supported_p (sbool_fn, vectype_out,
7526 : OPTIMIZE_FOR_SPEED))
7527 110 : reduc_fn = sbool_fn;
7528 18196 : else if (reduction_type != FOLD_LEFT_REDUCTION
7529 18196 : && alt_vectype_out
7530 18196 : && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
7531 : OPTIMIZE_FOR_SPEED))
7532 790 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7533 : else
7534 : {
7535 17406 : if (dump_enabled_p ())
7536 922 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7537 : "reduc op not supported by target.\n");
7538 :
7539 17406 : reduc_fn = IFN_LAST;
7540 : }
7541 : }
7542 : else
7543 : {
7544 672 : if (dump_enabled_p ())
7545 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7546 : "no reduc code for scalar code.\n");
7547 :
7548 672 : return false;
7549 : }
7550 71794 : if (reduc_fn == IFN_LAST
7551 71794 : && VECTOR_BOOLEAN_TYPE_P (vectype_out))
7552 : {
7553 221 : if (!alt_vectype_out)
7554 : {
7555 12 : if (dump_enabled_p ())
7556 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7557 : "cannot turn mask into bool data vector for "
7558 : "reduction epilogue.\n");
7559 12 : return false;
7560 : }
7561 209 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7562 : }
7563 : }
7564 408 : else if (reduction_type == COND_REDUCTION)
7565 : {
7566 408 : int scalar_precision
7567 408 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7568 408 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
7569 408 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7570 : vectype_out);
7571 :
7572 408 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7573 : OPTIMIZE_FOR_SPEED))
7574 12 : reduc_fn = IFN_REDUC_MAX;
7575 : }
7576 72190 : VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
7577 :
7578 72190 : if (reduction_type != EXTRACT_LAST_REDUCTION
7579 : && reduc_fn == IFN_LAST
7580 : && !nunits_out.is_constant ())
7581 : {
7582 : if (dump_enabled_p ())
7583 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7584 : "missing target support for reduction on"
7585 : " variable-length vectors.\n");
7586 : return false;
7587 : }
7588 :
7589 : /* For SLP reductions, see if there is a neutral value we can use. */
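: /* As a reminder of what neutral_op_for_reduction computes: the
:    neutral element is e.g. 0 for PLUS_EXPR, BIT_IOR_EXPR and
:    BIT_XOR_EXPR, 1 for MULT_EXPR, all-ones for BIT_AND_EXPR, and for
:    MIN_EXPR/MAX_EXPR the initial value itself. */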
7590 72190 : tree neutral_op = NULL_TREE;
7591 72190 : tree initial_value = NULL_TREE;
7592 72190 : if (reduc_chain)
7593 2216 : initial_value = vect_phi_initial_value (reduc_def_phi);
7594 72190 : neutral_op = neutral_op_for_reduction (TREE_TYPE
7595 : (gimple_phi_result (reduc_def_phi)),
7596 : orig_code, initial_value);
7597 72190 : VECT_REDUC_INFO_NEUTRAL_OP (reduc_info) = neutral_op;
7598 :
7599 72190 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7600 : {
7601 : /* We can't support in-order reductions of code such as this:
7602 :
7603 : for (int i = 0; i < n1; ++i)
7604 : for (int j = 0; j < n2; ++j)
7605 : l += a[j];
7606 :
7607 : since GCC effectively transforms the loop when vectorizing:
7608 :
7609 : for (int i = 0; i < n1 / VF; ++i)
7610 : for (int j = 0; j < n2; ++j)
7611 : for (int k = 0; k < VF; ++k)
7612 : l += a[j];
7613 :
7614 : which is a reassociation of the original operation. */
7615 56 : if (dump_enabled_p ())
7616 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7617 : "in-order double reduction not supported.\n");
7618 :
7619 56 : return false;
7620 : }
7621 :
7622 72134 : if (reduction_type == FOLD_LEFT_REDUCTION
7623 4355 : && SLP_TREE_LANES (slp_node) > 1
7624 159 : && !reduc_chain)
7625 : {
7626 : /* We cannot use in-order reductions in this case because there is
7627 : an implicit reassociation of the operations involved. */
7628 64 : if (dump_enabled_p ())
7629 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7630 : "in-order unchained SLP reductions not supported.\n");
7631 64 : return false;
7632 : }
7633 :
7634 : /* For double reductions, and for SLP reductions with a neutral value,
7635 : we construct a variable-length initial vector by loading a vector
7636 : full of the neutral value and then shift-and-inserting the start
7637 : values into the low-numbered elements. This is however not needed
7638 : when neutral and initial value are equal or we can handle the
7639 : initial value via adjustment in the epilogue. */
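: /* Sketch (element placement is illustrative only): starting from a
:    vector { n, n, ..., n } of the neutral value, a VEC_SHL_INSERT of
:    the start value i yields { i, n, ..., n }, i.e. the start value
:    lands in a low-numbered element and the rest stays neutral. */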
7640 72070 : if ((double_reduc || neutral_op)
7641 : && !nunits_out.is_constant ()
7642 : && reduction_type != INTEGER_INDUC_COND_REDUCTION
7643 : && !((SLP_TREE_LANES (slp_node) == 1 || reduc_chain)
7644 : && neutral_op
7645 : && (!double_reduc
7646 : || operand_equal_p (neutral_op,
7647 : vect_phi_initial_value (reduc_def_phi))))
7648 : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7649 : vectype_out, OPTIMIZE_FOR_BOTH))
7650 : {
7651 : if (dump_enabled_p ())
7652 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7653 : "reduction on variable-length vectors requires"
7654 : " target support for a vector-shift-and-insert"
7655 : " operation.\n");
7656 : return false;
7657 : }
7658 :
7659 : /* Check extra constraints for variable-length unchained SLP reductions. */
7660 72070 : if (!reduc_chain
7661 : && !nunits_out.is_constant ())
7662 : {
7663 : /* We checked above that we could build the initial vector when
7664 : there's a neutral element value. Check here for the case in
7665 : which each SLP statement has its own initial value and in which
7666 : that value needs to be repeated for every instance of the
7667 : statement within the initial vector. */
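: /* E.g. for a group of two reductions with initial values i0 and i1
:    the initial vector must be { i0, i1, i0, i1, ... } across all
:    elements, which is what duplicate-and-interleave builds. */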
7668 : unsigned int group_size = SLP_TREE_LANES (slp_node);
7669 : if (!neutral_op
7670 : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7671 : TREE_TYPE (vectype_out)))
7672 : {
7673 : if (dump_enabled_p ())
7674 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7675 : "unsupported form of SLP reduction for"
7676 : " variable-length vectors: cannot build"
7677 : " initial vector.\n");
7678 : return false;
7679 : }
7680 : /* The epilogue code relies on the number of elements being a multiple
7681 : of the group size. The duplicate-and-interleave approach to setting
7682 : up the initial vector does too. */
7683 : if (!multiple_p (nunits_out, group_size))
7684 : {
7685 : if (dump_enabled_p ())
7686 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7687 : "unsupported form of SLP reduction for"
7688 : " variable-length vectors: the vector size"
7689 : " is not a multiple of the number of results.\n");
7690 : return false;
7691 : }
7692 : }
7693 :
7694 72070 : if (reduction_type == COND_REDUCTION)
7695 : {
7696 408 : widest_int ni;
7697 :
7698 408 : if (! max_loop_iterations (loop, &ni))
7699 : {
7700 14 : if (dump_enabled_p ())
7701 0 : dump_printf_loc (MSG_NOTE, vect_location,
7702 : "loop count not known, cannot create cond "
7703 : "reduction.\n");
7704 14 : return false;
7705 : }
7706 : /* Convert backedges to iterations. */
7707 394 : ni += 1;
7708 :
7709 : /* The additional index will be the same type as the condition. Check
7710 : that the loop can fit into this less one (because we'll use up the
7711 : zero slot for when there are no matches). */
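: /* For example, with 16-bit elements the index vector has unsigned
:    short elements, whose maximum is 65535; since index zero is
:    reserved for "no match", at most 65534 iterations can be
:    distinguished. */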
7712 394 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7713 394 : if (wi::geu_p (ni, wi::to_widest (max_index)))
7714 : {
7715 90 : if (dump_enabled_p ())
7716 54 : dump_printf_loc (MSG_NOTE, vect_location,
7717 : "loop size is greater than data size.\n");
7718 90 : return false;
7719 : }
7720 408 : }
7721 :
7722 : /* In case the vectorization factor (VF) is bigger than the number
7723 : of elements that we can fit in a vectype (nunits), we have to generate
7724 : more than one vector stmt; i.e., we need to "unroll" the
7725 : vector stmt by a factor of VF/nunits. For more details see the
7726 : documentation in vectorizable_operation. */
7727 :
7728 : /* If the reduction is used in an outer loop we need to generate
7729 : VF intermediate results, like so (e.g. for ncopies=2):
7730 : r0 = phi (init, r0)
7731 : r1 = phi (init, r1)
7732 : r0 = x0 + r0;
7733 : r1 = x1 + r1;
7734 : (i.e. we generate VF results in 2 registers).
7735 : In this case we have a separate def-use cycle for each copy, and therefore
7736 : for each copy we get the vector def for the reduction variable from the
7737 : respective phi node created for this copy.
7738 :
7739 : Otherwise (the reduction is unused in the loop nest), we can combine
7740 : together intermediate results, like so (e.g. for ncopies=2):
7741 : r = phi (init, r)
7742 : r = x0 + r;
7743 : r = x1 + r;
7744 : (i.e. we generate VF/2 results in a single register).
7745 : In this case for each copy we get the vector def for the reduction variable
7746 : from the vectorized reduction operation generated in the previous iteration.
7747 :
7748 : This only works when we see both the reduction PHI and its only consumer
7749 : in vectorizable_reduction and there are no intermediate stmts
7750 : participating. When unrolling we want each unrolled iteration to have its
7751 : own reduction accumulator since one of the main goals of unrolling a
7752 : reduction is to reduce the aggregate loop-carried latency. */
7753 71966 : if (ncopies > 1
7754 71966 : && !reduc_chain
7755 8001 : && SLP_TREE_LANES (slp_node) == 1
7756 7830 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7757 7807 : && reduc_chain_length == 1
7758 7420 : && loop_vinfo->suggested_unroll_factor == 1)
7759 71966 : single_defuse_cycle = true;
7760 :
7761 71966 : if (single_defuse_cycle && !lane_reducing)
7762 : {
7763 6475 : gcc_assert (op.code != COND_EXPR);
7764 :
7765 : /* 4. Check support for the operation in the loop
7766 :
7767 : This isn't necessary for the lane reduction codes, since they
7768 : can only be produced by pattern matching, and it's up to the
7769 : pattern matcher to test for support. The main reason for
7770 : specifically skipping this step is to avoid rechecking whether
7771 : mixed-sign dot-products can be implemented using signed
7772 : dot-products. */
7773 6475 : machine_mode vec_mode = TYPE_MODE (vectype_in);
7774 6475 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
7775 : {
7776 2065 : if (dump_enabled_p ())
7777 44 : dump_printf (MSG_NOTE, "op not supported by target.\n");
7778 4130 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7779 2065 : || !vect_can_vectorize_without_simd_p (op.code))
7780 : single_defuse_cycle = false;
7781 : else
7782 5 : if (dump_enabled_p ())
7783 0 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7784 : }
7785 :
7786 6475 : if (vect_emulated_vector_p (vectype_in)
7787 6475 : && !vect_can_vectorize_without_simd_p (op.code))
7788 : {
7789 0 : if (dump_enabled_p ())
7790 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
7791 0 : return false;
7792 : }
7793 : }
7794 71966 : if (dump_enabled_p () && single_defuse_cycle)
7795 695 : dump_printf_loc (MSG_NOTE, vect_location,
7796 : "using single def-use cycle for reduction by reducing "
7797 : "multiple vectors to one in the loop body\n");
7798 71966 : VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7799 :
7800 : /* For a lane-reducing operation, the processing below related to the
7801 : single def-use cycle will be done in its own vectorizable function.
7802 : One more thing to note is that the operation must not be involved
7803 : in a fold-left reduction. */
7804 71966 : single_defuse_cycle &= !lane_reducing;
7805 :
7806 71966 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
7807 28315 : for (i = 0; i < (int) op.num_ops; i++)
7808 19674 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7809 : {
7810 0 : if (dump_enabled_p ())
7811 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7812 : "incompatible vector types for invariants\n");
7813 0 : return false;
7814 : }
7815 :
7816 71966 : vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
7817 : reduction_type, ncopies, cost_vec);
7818 : /* Cost the reduction op inside the loop if transformed via
7819 : vect_transform_reduction for non-lane-reducing operation. Otherwise
7820 : vect_transform_reduction for a non-lane-reducing operation. Otherwise
7821 71966 : if (single_defuse_cycle)
7822 4415 : record_stmt_cost (cost_vec, ncopies, vector_stmt,
7823 : slp_for_stmt_info, 0, vect_body);
7824 :
7825 71966 : if (dump_enabled_p ()
7826 71966 : && reduction_type == FOLD_LEFT_REDUCTION)
7827 244 : dump_printf_loc (MSG_NOTE, vect_location,
7828 : "using an in-order (fold-left) reduction.\n");
7829 71966 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7830 :
7831 : /* All but single def-use cycle optimized and fold-left reductions go
7832 : through their own vectorizable_* routines. */
7833 71966 : stmt_vec_info tem
7834 71966 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (slp_node_instance));
7835 71966 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
7836 63325 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7837 : else
7838 : {
7839 8641 : STMT_VINFO_DEF_TYPE (tem) = vect_reduction_def;
7840 8641 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7841 4015 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7842 : slp_node, op.code, op.type,
7843 : vectype_in);
7844 : }
7845 : return true;
7846 : }
7847 :
7848 : /* STMT_INFO is a dot-product reduction whose multiplication operands
7849 : have different signs. Emit a sequence to emulate the operation
7850 : using a series of signed DOT_PROD_EXPRs and return the last
7851 : statement generated. VEC_DEST is the result of the vector operation
7852 : and VOP lists its inputs. */
7853 :
7854 : static gassign *
7855 4 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7856 : gimple_stmt_iterator *gsi, tree vec_dest,
7857 : tree vop[3])
7858 : {
7859 4 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7860 4 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7861 4 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
7862 4 : gimple *new_stmt;
7863 :
7864 : /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
7865 4 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7866 0 : std::swap (vop[0], vop[1]);
7867 :
7868 : /* Convert all inputs to signed types. */
7869 12 : for (int i = 1; i < 3; ++i)
7870 8 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7871 : {
7872 0 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7873 0 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7874 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7875 0 : vop[i] = tmp;
7876 : }
7877 :
7878 : /* In the comments below we assume 8-bit inputs for simplicity,
7879 : but the approach works for any full integer type. */
7880 :
7881 : /* Create a vector of -128. */
7882 4 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7883 4 : tree min_narrow = build_vector_from_val (TREE_TYPE (vop[0]),
7884 4 : fold_convert
7885 : (TREE_TYPE (TREE_TYPE (vop[0])),
7886 : min_narrow_elttype));
7887 :
7888 : /* Create a vector of 64. */
7889 4 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7890 4 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7891 4 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7892 :
7893 : /* Emit: SUB_RES = VOP[0] - 128 in an unsigned type. */
7894 4 : tree sub_res = make_ssa_name (TREE_TYPE (vop[0]));
7895 4 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7896 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7897 :
7898 4 : vop[0] = make_ssa_name (narrow_vectype);
7899 4 : new_stmt = gimple_build_assign (vop[0], VIEW_CONVERT_EXPR,
7900 : build1 (VIEW_CONVERT_EXPR, narrow_vectype,
7901 : sub_res));
7902 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7903 :
7904 : /* Emit:
7905 :
7906 : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7907 : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7908 : STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
7909 :
7910 : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
7911 : Doing the two 64 * y steps first allows more time to compute x. */
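: /* Spelling the identity out: (x - 128) * y + 64 * y + 64 * y
:    == x * y - 128 * y + 128 * y == x * y; the subtraction x - 128
:    maps the unsigned range [0, 255] onto the signed range
:    [-128, 127], so every dot-product input is signed. */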
7912 4 : tree stage1 = make_ssa_name (wide_vectype);
7913 4 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7914 : vop[1], half_narrow, vop[2]);
7915 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7916 :
7917 4 : tree stage2 = make_ssa_name (wide_vectype);
7918 4 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7919 : vop[1], half_narrow, stage1);
7920 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7921 :
7922 4 : tree stage3 = make_ssa_name (wide_vectype);
7923 4 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7924 : vop[0], vop[1], stage2);
7925 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7926 :
7927 : /* Convert STAGE3 to the reduction type. */
7928 4 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7929 4 : }
7930 :
7931 : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7932 : value. */
7933 :
7934 : bool
7935 2581 : vect_transform_reduction (loop_vec_info loop_vinfo,
7936 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7937 : slp_tree slp_node)
7938 : {
7939 2581 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
7940 2581 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7941 2581 : unsigned vec_num;
7942 :
7943 2581 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
7944 :
7945 2581 : if (nested_in_vect_loop_p (loop, stmt_info))
7946 : {
7947 0 : loop = loop->inner;
7948 0 : gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
7949 : == vect_double_reduction_def);
7950 : }
7951 :
7952 2581 : gimple_match_op op;
7953 2581 : if (!gimple_extract_op (stmt_info->stmt, &op))
7954 0 : gcc_unreachable ();
7955 :
7956 : /* All uses but the last are expected to be defined in the loop.
7957 : The last use is the reduction variable. In case of a nested cycle this
7958 : assumption is not true: we use reduc_index to record the index of the
7959 : reduction variable. */
7960 2581 : int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
7961 2581 : tree vectype_in = SLP_TREE_VECTYPE (slp_node);
7962 2581 : if (lane_reducing_op_p (op.code))
7963 260 : vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);
7964 :
7965 2581 : vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
7966 :
7967 2581 : code_helper code = canonicalize_code (op.code, op.type);
7968 2581 : internal_fn cond_fn
7969 476 : = ((code.is_internal_fn ()
7970 476 : && internal_fn_mask_index ((internal_fn)code) != -1)
7971 2581 : ? (internal_fn)code : get_conditional_internal_fn (code, op.type));
7972 :
7973 2581 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7974 2581 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7975 2581 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7976 :
7977 : /* Transform. */
7978 2581 : tree new_temp = NULL_TREE;
7979 18067 : auto_vec<tree> vec_oprnds[3];
7980 :
7981 2581 : if (dump_enabled_p ())
7982 748 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7983 :
7984 : /* A binary COND_OP reduction must have the same definition and else
7985 : value. */
7986 3057 : bool cond_fn_p = code.is_internal_fn ()
7987 476 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
7988 476 : if (cond_fn_p)
7989 : {
7990 476 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
7991 : || code == IFN_COND_MUL || code == IFN_COND_AND
7992 : || code == IFN_COND_IOR || code == IFN_COND_XOR
7993 : || code == IFN_COND_MIN || code == IFN_COND_MAX);
7994 476 : gcc_assert (op.num_ops == 4
7995 : && (op.ops[reduc_index]
7996 : == op.ops[internal_fn_else_index ((internal_fn) code)]));
7997 : }
7998 :
7999 2581 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8000 :
8001 2581 : vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
8002 2581 : if (reduction_type == FOLD_LEFT_REDUCTION)
8003 : {
8004 839 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
8005 839 : gcc_assert (code.is_tree_code () || cond_fn_p);
8006 839 : return vectorize_fold_left_reduction
8007 839 : (loop_vinfo, stmt_info, gsi, slp_node,
8008 839 : code, reduc_fn, op.num_ops, vectype_in,
8009 839 : reduc_index, masks, lens);
8010 : }
8011 :
8012 1742 : bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
8013 1742 : bool lane_reducing = lane_reducing_op_p (code);
8014 1482 : gcc_assert (single_defuse_cycle || lane_reducing);
8015 :
8016 1742 : if (lane_reducing)
8017 : {
8018 : /* The last operand of a lane-reducing op is the reduction operand. */
8019 260 : gcc_assert (reduc_index == (int) op.num_ops - 1);
8020 : }
8021 :
8022 : /* Create the destination vector */
8023 1742 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8024 1742 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8025 :
8026 : /* Get NCOPIES vector definitions for all operands except the reduction
8027 : definition. */
8028 1742 : if (!cond_fn_p)
8029 : {
8030 1289 : gcc_assert (reduc_index >= 0 && reduc_index <= 2);
8031 2131 : vect_get_vec_defs (loop_vinfo, slp_node,
8032 1289 : single_defuse_cycle && reduc_index == 0
8033 : ? NULL_TREE : op.ops[0], &vec_oprnds[0],
8034 1289 : single_defuse_cycle && reduc_index == 1
8035 : ? NULL_TREE : op.ops[1], &vec_oprnds[1],
8036 1289 : op.num_ops == 3
8037 260 : && !(single_defuse_cycle && reduc_index == 2)
8038 : ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
8039 : }
8040 : else
8041 : {
8042 : /* For a conditional operation pass the truth type as mask
8043 : vectype. */
8044 453 : gcc_assert (single_defuse_cycle
8045 : && (reduc_index == 1 || reduc_index == 2));
8046 453 : vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
8047 : &vec_oprnds[0],
8048 : reduc_index == 1 ? NULL_TREE : op.ops[1],
8049 : &vec_oprnds[1],
8050 : reduc_index == 2 ? NULL_TREE : op.ops[2],
8051 : &vec_oprnds[2]);
8052 : }
8053 :
8054 : /* For single def-use cycles get one copy of the vectorized reduction
8055 : definition. */
8056 1742 : if (single_defuse_cycle)
8057 : {
8058 1653 : vect_get_vec_defs (loop_vinfo, slp_node,
8059 : reduc_index == 0 ? op.ops[0] : NULL_TREE,
8060 : &vec_oprnds[0],
8061 : reduc_index == 1 ? op.ops[1] : NULL_TREE,
8062 : &vec_oprnds[1],
8063 : reduc_index == 2 ? op.ops[2] : NULL_TREE,
8064 : &vec_oprnds[2]);
8065 : }
8066 89 : else if (lane_reducing)
8067 : {
8068 : /* For a normal reduction, consistency between vectorized def/use is
8069 : naturally ensured when mapping from the scalar statement. But if a
8070 : lane-reducing op is involved in the reduction, things become somewhat
8071 : complicated in that the op's result and its operand for accumulation
8072 : are limited to fewer lanes than the other operands, which causes a
8073 : def/use mismatch on adjacent statements around the op if we do not
8074 : apply any kind of specific adjustment. One approach is to refit the
8075 : lane-reducing op by introducing new trivial pass-through copies to
8076 : fix the possible def/use gap, so as to make it behave like a normal op.
8077 : And vector reduction PHIs are always generated to the full extent, no
8078 : matter whether a lane-reducing op exists or not. If some copies or
8079 : PHIs are actually superfluous, they would be cleaned up by passes after
8080 : vectorization. An example for single-lane SLP, with lane-reducing ops
8081 : of mixed input vectypes in a reduction chain, is given below.
8082 : This handling is similarly applicable to multiple-lane SLP.
8083 :
8084 : int sum = 1;
8085 : for (i)
8086 : {
8087 : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
8088 : sum += w[i]; // widen-sum <vector(16) char>
8089 : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
8090 : sum += n[i]; // normal <vector(4) int>
8091 : }
8092 :
8093 : The vector size is 128-bit, the vectorization factor is 16. Reduction
8094 : statements would be transformed as:
8095 :
8096 : vector<4> int sum_v0 = { 0, 0, 0, 1 };
8097 : vector<4> int sum_v1 = { 0, 0, 0, 0 };
8098 : vector<4> int sum_v2 = { 0, 0, 0, 0 };
8099 : vector<4> int sum_v3 = { 0, 0, 0, 0 };
8100 :
8101 : for (i / 16)
8102 : {
8103 : sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8104 : sum_v1 = sum_v1; // copy
8105 : sum_v2 = sum_v2; // copy
8106 : sum_v3 = sum_v3; // copy
8107 :
8108 : sum_v0 = sum_v0; // copy
8109 : sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8110 : sum_v2 = sum_v2; // copy
8111 : sum_v3 = sum_v3; // copy
8112 :
8113 : sum_v0 = sum_v0; // copy
8114 : sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8115 : sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8116 : sum_v3 = sum_v3; // copy
8117 :
8118 : sum_v0 += n_v0[i: 0 ~ 3 ];
8119 : sum_v1 += n_v1[i: 4 ~ 7 ];
8120 : sum_v2 += n_v2[i: 8 ~ 11];
8121 : sum_v3 += n_v3[i: 12 ~ 15];
8122 : }
8123 :
8124 : Moreover, for higher instruction parallelism in the final vectorized
8125 : loop, the effective vector lane-reducing ops are distributed evenly
8126 : among all def-use cycles. In the above example, DOT_PROD, WIDEN_SUM
8127 : and the SADs are generated into disparate cycles, so the instruction
8128 : dependencies among them can be eliminated. */
8129 89 : unsigned effec_ncopies = vec_oprnds[0].length ();
8130 89 : unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8131 :
8132 89 : gcc_assert (effec_ncopies <= total_ncopies);
8133 :
8134 89 : if (effec_ncopies < total_ncopies)
8135 : {
8136 267 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8137 : {
8138 356 : gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8139 178 : vec_oprnds[i].safe_grow_cleared (total_ncopies);
8140 : }
8141 : }
8142 :
8143 89 : tree reduc_vectype_in = vectype_in;
8144 89 : gcc_assert (reduc_vectype_in);
8145 :
8146 89 : unsigned effec_reduc_ncopies
8147 89 : = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
8148 :
8149 89 : gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8150 :
8151 89 : if (effec_ncopies < effec_reduc_ncopies)
8152 : {
8153 : /* Find suitable def-use cycles to generate vectorized statements
8154 : into, and reorder operands based on the selection. */
8155 0 : unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
8156 0 : unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8157 :
8158 0 : gcc_assert (curr_pos < effec_reduc_ncopies);
8159 0 : VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;
8160 :
8161 0 : if (curr_pos)
8162 : {
8163 0 : unsigned count = effec_reduc_ncopies - effec_ncopies;
8164 0 : unsigned start = curr_pos - count;
8165 :
8166 0 : if ((int) start < 0)
8167 : {
8168 0 : count = curr_pos;
8169 0 : start = 0;
8170 : }
8171 :
8172 0 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8173 : {
8174 0 : for (unsigned j = effec_ncopies; j > start; j--)
8175 : {
8176 0 : unsigned k = j - 1;
8177 0 : std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8178 0 : gcc_assert (!vec_oprnds[i][k]);
8179 : }
8180 : }
8181 : }
8182 : }
8183 : }
8184 :
8185 1742 : bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
8186 2991 : unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8187 1742 : unsigned mask_index = 0;
8188 :
8189 7661 : for (unsigned i = 0; i < num; ++i)
8190 : {
8191 5919 : gimple *new_stmt;
8192 5919 : tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8193 5919 : if (!vop[0] || !vop[1])
8194 : {
8195 473 : tree reduc_vop = vec_oprnds[reduc_index][i];
8196 :
8197 : /* If we could not generate an effective vector statement for the
8198 : current portion of the reduction operand, insert a trivial copy to
8199 : simply hand the operand over to other dependent statements. */
8200 473 : gcc_assert (reduc_vop);
8201 :
8202 473 : if (TREE_CODE (reduc_vop) == SSA_NAME
8203 473 : && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8204 473 : new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8205 : else
8206 : {
8207 0 : new_temp = make_ssa_name (vec_dest);
8208 0 : new_stmt = gimple_build_assign (new_temp, reduc_vop);
8209 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8210 : gsi);
8211 : }
8212 : }
8213 5446 : else if (masked_loop_p && !mask_by_cond_expr)
8214 : {
8215 : /* No conditional ifns have been defined for lane-reducing ops
8216 : yet. */
8217 16 : gcc_assert (!lane_reducing);
8218 :
8219 16 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8220 : vec_num, vectype_in,
8221 : mask_index++);
8222 16 : gcall *call;
8223 24 : if (code.is_internal_fn () && cond_fn_p)
8224 : {
8225 16 : gcc_assert (op.num_ops >= 3
8226 : && internal_fn_mask_index (internal_fn (code)) == 0);
8227 8 : vop[2] = vec_oprnds[2][i];
8228 8 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
8229 : mask, vop[0], gsi);
8230 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
8231 : vop[2], vop[reduc_index]);
8232 : }
8233 : else
8234 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
8235 : vop[1], vop[reduc_index]);
8236 16 : new_temp = make_ssa_name (vec_dest, call);
8237 16 : gimple_call_set_lhs (call, new_temp);
8238 16 : gimple_call_set_nothrow (call, true);
8239 16 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8240 16 : new_stmt = call;
8241 : }
8242 : else
8243 : {
8244 5430 : if (op.num_ops >= 3)
8245 1770 : vop[2] = vec_oprnds[2][i];
8246 :
8247 5430 : if (masked_loop_p && mask_by_cond_expr)
8248 : {
8249 4 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8250 : vec_num, vectype_in,
8251 : mask_index++);
8252 4 : build_vect_cond_expr (code, vop, mask, gsi);
8253 : }
8254 :
8255 5430 : if (emulated_mixed_dot_prod)
8256 4 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8257 : vec_dest, vop);
8258 :
8259 6768 : else if (code.is_internal_fn () && !cond_fn_p)
8260 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8261 : op.num_ops,
8262 : vop[0], vop[1], vop[2]);
8263 6768 : else if (code.is_internal_fn () && cond_fn_p)
8264 1342 : new_stmt = gimple_build_call_internal (internal_fn (code),
8265 : op.num_ops,
8266 : vop[0], vop[1], vop[2],
8267 : vop[reduc_index]);
8268 : else
8269 4084 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8270 : vop[0], vop[1], vop[2]);
8271 5430 : new_temp = make_ssa_name (vec_dest, new_stmt);
8272 5430 : gimple_set_lhs (new_stmt, new_temp);
8273 5430 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8274 : }
8275 :
8276 5919 : if (single_defuse_cycle && i < num - 1)
8277 3546 : vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8278 : else
8279 2373 : slp_node->push_vec_def (new_stmt);
8280 : }
8281 :
8282 : return true;
8283 10324 : }
8284 :
8285 : /* Transform phase of a cycle PHI. */
8286 :
8287 : bool
8288 23476 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8289 : stmt_vec_info stmt_info,
8290 : slp_tree slp_node, slp_instance slp_node_instance)
8291 : {
8292 23476 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
8293 23476 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8294 23476 : int i;
8295 23476 : bool nested_cycle = false;
8296 23476 : int vec_num;
8297 :
8298 23614 : if (nested_in_vect_loop_p (loop, stmt_info))
8299 : {
8300 : loop = loop->inner;
8301 : nested_cycle = true;
8302 : }
8303 :
8304 23476 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
8305 23476 : if (reduc_info
8306 22814 : && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8307 22814 : || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
8308 : /* Leave the scalar phi in place. */
8309 : return true;
8310 :
8311 21975 : if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ())
8312 118 : dump_printf_loc (MSG_NOTE, vect_location,
8313 : "vectorizing a reduction chain\n");
8314 :
8315 22637 : vec_num = vect_get_num_copies (loop_vinfo, slp_node);
8316 :
8317 : /* Check whether we should use a single PHI node and accumulate
8318 : vectors to one before the backedge. */
8319 22637 : if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
8320 22637 : vec_num = 1;
8321 :
8322 : /* Create the destination vector */
8323 22637 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8324 22637 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8325 : vectype_out);
8326 :
8327 : /* Get the loop-entry arguments. */
8328 22637 : auto_vec<tree> vec_initial_defs;
8329 22637 : vec_initial_defs.reserve (vec_num);
8330 : /* Optimize: for REDUC_MAX, if initial_def is smaller than the base
8331 : and we can't use zero for induc_val, use initial_def. Similarly
8332 : for REDUC_MIN when initial_def is larger than the base. */
8333 22637 : if (reduc_info
8334 21975 : && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8335 : {
8336 62 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8337 62 : tree initial_def = vect_phi_initial_value (phi);
8338 62 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
8339 62 : tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
8340 62 : if (TREE_CODE (initial_def) == INTEGER_CST
8341 60 : && !integer_zerop (induc_val)
8342 122 : && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
8343 42 : && tree_int_cst_lt (initial_def, induc_val))
8344 58 : || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
8345 18 : && tree_int_cst_lt (induc_val, initial_def))))
8346 : {
8347 2 : induc_val = initial_def;
8348 : /* Communicate to epilogue generation that we used
8349 : the initial_def. */
8350 2 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8351 : }
8352 62 : vec_initial_defs.quick_push
8353 62 : (build_vector_from_val (vectype_out, induc_val));
8354 62 : }
8355 22575 : else if (nested_cycle)
8356 : {
8357 748 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8358 748 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8359 : &vec_initial_defs);
8360 : }
8361 : else
8362 : {
8363 21827 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8364 21827 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
8365 21827 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8366 :
8367 21827 : unsigned int num_phis = stmts.length ();
8368 21827 : if (reduc_info->is_reduc_chain)
8369 185 : num_phis = 1;
8370 21827 : initial_values.reserve (num_phis);
8371 44099 : for (unsigned int i = 0; i < num_phis; ++i)
8372 : {
8373 22272 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8374 22272 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8375 : }
8376 21827 : tree neutral_op = VECT_REDUC_INFO_NEUTRAL_OP (reduc_info);
8377 21827 : if (vec_num == 1
8378 21827 : && vect_find_reusable_accumulator (loop_vinfo,
8379 : reduc_info, vectype_out))
8380 : ;
8381 : /* Try to simplify the vector initialization by applying an
8382 : adjustment after the reduction has been performed. This
8383 : can also break a critical path but on the other hand
8384 : requires keeping the initial value live across the loop. */
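: /* E.g. (illustrative): for  sum += a[i]  with initial value 42, the
:    vector accumulator can start at the neutral { 0, 0, 0, 0 } and 42
:    is added back to the scalar result in the epilogue. */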
8385 17812 : else if (neutral_op
8386 17255 : && initial_values.length () == 1
8387 17071 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8388 34806 : && !operand_equal_p (neutral_op, initial_values[0]))
8389 : {
8390 12152 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
8391 12152 : = initial_values[0];
8392 12152 : initial_values[0] = neutral_op;
8393 : }
8394 21827 : if (!VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
8395 4015 : || loop_vinfo->main_loop_edge)
8396 43208 : get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
8397 : &vec_initial_defs, vec_num,
8398 : stmts.length (), neutral_op);
8399 : }
8400 :
8401 22637 : if (reduc_info)
8402 21975 : if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
8403 : {
8404 4015 : tree def = accumulator->reduc_input;
8405 4015 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8406 : {
8407 4012 : unsigned int nreduc;
8408 8024 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8409 4012 : (TREE_TYPE (def)),
8410 4012 : TYPE_VECTOR_SUBPARTS (vectype_out),
8411 : &nreduc);
8412 0 : gcc_assert (res);
8413 4012 : gimple_seq stmts = NULL;
8414 : /* Reduce the single vector to a smaller one. */
8415 4012 : if (nreduc != 1)
8416 : {
8417 : /* Perform the reduction in the appropriate type. */
8418 4012 : tree rvectype = vectype_out;
8419 4012 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8420 4012 : TREE_TYPE (TREE_TYPE (def))))
8421 235 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8422 : TYPE_VECTOR_SUBPARTS
8423 470 : (vectype_out));
8424 4012 : def = vect_create_partial_epilog (def, rvectype,
8425 : VECT_REDUC_INFO_CODE
8426 : (reduc_info),
8427 : &stmts);
8428 : }
8429 : /* The epilogue loop might use a different vector mode, like
8430 : VNx2DI vs. V2DI. */
8431 4012 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8432 : {
8433 0 : tree reduc_type = build_vector_type_for_mode
8434 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8435 0 : def = gimple_convert (&stmts, reduc_type, def);
8436 : }
8437 : /* Adjust the input so we pick up the partially reduced value
8438 : for the skip edge in vect_create_epilog_for_reduction. */
8439 4012 : accumulator->reduc_input = def;
8440 : /* And the reduction could be carried out using a different sign. */
8441 4012 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8442 235 : def = gimple_convert (&stmts, vectype_out, def);
8443 4012 : edge e;
8444 4012 : if ((e = loop_vinfo->main_loop_edge)
8445 4012 : || (e = loop_vinfo->skip_this_loop_edge))
8446 : {
8447                 : 		  /* While we'd like to insert on the edge, doing so would split
8448                 : 		     blocks and disturb bookkeeping, and we will eventually need
8449                 : 		     the result on the skip edge as well.  Rely on sinking to fix
8450                 : 		     up optimal placement and insert in the pred.  */
8451 3789 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
8452 : /* Insert before a cond that eventually skips the
8453 : epilogue. */
8454 3789 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8455 3772 : gsi_prev (&gsi);
8456 3789 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8457 : }
8458 : else
8459 223 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8460 : stmts);
8461 : }
8462 4015 : if (loop_vinfo->main_loop_edge)
8463 3792 : vec_initial_defs[0]
8464 3792 : = vect_get_main_loop_result (loop_vinfo, def,
8465 3792 : vec_initial_defs[0]);
8466 : else
8467 223 : vec_initial_defs.safe_push (def);
8468 : }
8469 :
8470 : /* Generate the reduction PHIs upfront. */
8471 47061 : for (i = 0; i < vec_num; i++)
8472 : {
8473 24424 : tree vec_init_def = vec_initial_defs[i];
8474 : /* Create the reduction-phi that defines the reduction
8475 : operand. */
8476 24424 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
8477 24424 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8478 : UNKNOWN_LOCATION);
8479 :
8480 : /* The loop-latch arg is set in epilogue processing. */
8481 :
8482 24424 : slp_node->push_vec_def (new_phi);
8483 : }
8484 :
8485 22637 : return true;
8486 22637 : }
8487 :
8488 : /* Vectorizes LC PHIs. */
8489 :
8490 : bool
8491 181291 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8492 : stmt_vec_info stmt_info,
8493 : slp_tree slp_node)
8494 : {
8495 181291 : if (!loop_vinfo
8496 181291 : || !is_a <gphi *> (stmt_info->stmt)
8497 216748 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8498 : return false;
8499 :
8500 821 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8501 0 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8502 : return false;
8503 :
8504                 :   /* Deal with copies from externs or constants that masquerade as
8505 : loop-closed PHI nodes (PR97886). */
8506 821 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8507 : SLP_TREE_VECTYPE (slp_node)))
8508 : {
8509 0 : if (dump_enabled_p ())
8510 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8511 : "incompatible vector types for invariants\n");
8512 0 : return false;
8513 : }
8514 :
8515 : /* ??? This can happen with data vs. mask uses of boolean. */
8516 821 : if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
8517 821 : SLP_TREE_VECTYPE
8518 : (SLP_TREE_CHILDREN (slp_node)[0])))
8519 : {
8520 0 : if (dump_enabled_p ())
8521 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8522 : "missed mask promotion\n");
8523 0 : return false;
8524 : }
8525 :
8526 821 : SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
8527 821 : return true;
8528 : }
8529 :
8530 : bool
8531 530 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
8532 : stmt_vec_info stmt_info,
8533 : slp_tree slp_node)
8534 : {
8535 :
8536 530 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8537 530 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8538 530 : basic_block bb = gimple_bb (stmt_info->stmt);
8539 530 : edge e = single_pred_edge (bb);
8540 530 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8541 530 : auto_vec<tree> vec_oprnds;
8542 1060 : vect_get_vec_defs (loop_vinfo, slp_node,
8543 530 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8544 1175 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8545 : {
8546 : /* Create the vectorized LC PHI node. */
8547 645 : gphi *new_phi = create_phi_node (vec_dest, bb);
8548 645 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8549 645 : slp_node->push_vec_def (new_phi);
8550 : }
8551 :
8552 530 : return true;
8553 530 : }
8554 :
8555 : /* Vectorizes PHIs. */
8556 :
8557 : bool
8558 138734 : vectorizable_phi (bb_vec_info vinfo,
8559 : stmt_vec_info stmt_info,
8560 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8561 : {
8562 138734 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8563 : return false;
8564 :
8565 71190 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8566 : return false;
8567 :
8568 71190 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8569 :
8570 71190 : if (cost_vec) /* transformation not required. */
8571 : {
8572 : slp_tree child;
8573 : unsigned i;
8574 194320 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8575 137309 : if (!child)
8576 : {
8577 0 : if (dump_enabled_p ())
8578 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8579 : "PHI node with unvectorized backedge def\n");
8580 0 : return false;
8581 : }
8582 137309 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8583 : {
8584 18 : if (dump_enabled_p ())
8585 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8586 : "incompatible vector types for invariants\n");
8587 18 : return false;
8588 : }
8589 137291 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8590 137291 : && !useless_type_conversion_p (vectype,
8591 : SLP_TREE_VECTYPE (child)))
8592 : {
8593 : /* With bools we can have mask and non-mask precision vectors
8594                 : 	       or different non-mask precisions.  While pattern recog is
8595                 : 	       supposed to guarantee consistency here, bugs in it can cause
8596 : mismatches (PR103489 and PR103800 for example).
8597 : Deal with them here instead of ICEing later. */
8598 18 : if (dump_enabled_p ())
8599 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8600 : "incompatible vector type setup from "
8601 : "bool pattern detection\n");
8602 18 : return false;
8603 : }
8604 :
8605                 :       /* For single-argument PHIs assume coalescing, which means zero cost
8606 : for the scalar and the vector PHIs. This avoids artificially
8607 : favoring the vector path (but may pessimize it in some cases). */
8608 57011 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8609 51717 : record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
8610 : vector_stmt, slp_node, vectype, 0, vect_body);
8611 57011 : SLP_TREE_TYPE (slp_node) = phi_info_type;
8612 57011 : return true;
8613 : }
8614 :
8615 14143 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8616 14143 : basic_block bb = gimple_bb (stmt_info->stmt);
8617 14143 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8618 14143 : auto_vec<gphi *> new_phis;
8619 51205 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8620 : {
8621 37062 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8622 :
8623 : /* Skip not yet vectorized defs. */
8624 37511 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8625 37062 : && SLP_TREE_VEC_DEFS (child).is_empty ())
8626 449 : continue;
8627 :
8628 36613 : auto_vec<tree> vec_oprnds;
8629 36613 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8630 36613 : if (!new_phis.exists ())
8631 : {
8632 14143 : new_phis.create (vec_oprnds.length ());
8633 29922 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8634 : {
8635 : /* Create the vectorized LC PHI node. */
8636 15779 : new_phis.quick_push (create_phi_node (vec_dest, bb));
8637 15779 : slp_node->push_vec_def (new_phis[j]);
8638 : }
8639 : }
8640 36613 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8641 79942 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8642 43329 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8643 36613 : }
8644 : /* We should have at least one already vectorized child. */
8645 14143 : gcc_assert (new_phis.exists ());
8646 :
8647 14143 : return true;
8648 14143 : }
8649 :
8650 : /* Vectorizes first order recurrences. An overview of the transformation
8651 : is described below. Suppose we have the following loop.
8652 :
8653 : int t = 0;
8654 : for (int i = 0; i < n; ++i)
8655 : {
8656 : b[i] = a[i] - t;
8657 : t = a[i];
8658 : }
8659 :
8660 : There is a first-order recurrence on 'a'. For this loop, the scalar IR
8661 : looks (simplified) like:
8662 :
8663 : scalar.preheader:
8664 : init = 0;
8665 :
8666 : scalar.body:
8667 : i = PHI <0(scalar.preheader), i+1(scalar.body)>
8668                 :      _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8669 : _1 = a[i]
8670 : b[i] = _1 - _2
8671 : if (i < n) goto scalar.body
8672 :
8673                 :    In this example, _2 is a recurrence because its value depends on the
8674 : previous iteration. We vectorize this as (VF = 4)
8675 :
8676 : vector.preheader:
8677 : vect_init = vect_cst(..., ..., ..., 0)
8678 :
8679 : vector.body
8680 : i = PHI <0(vector.preheader), i+4(vector.body)>
8681 : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8682 : vect_2 = a[i, i+1, i+2, i+3];
8683 : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8684 : b[i, i+1, i+2, i+3] = vect_2 - vect_3
8685 : if (..) goto vector.body
8686 :
8687 : In this function, vectorizable_recurr, we code generate both the
8688 : vector PHI node and the permute since those together compute the
8689 : vectorized value of the scalar PHI. We do not yet have the
8690 : backedge value to fill in there nor into the vec_perm. Those
8691 : are filled in vect_schedule_scc.
8692 :
8693 : TODO: Since the scalar loop does not have a use of the recurrence
8694                 :    outside of the loop, the natural way to implement peeling via
8695 : vectorizing the live value doesn't work. For now peeling of loops
8696 : with a recurrence is not implemented. For SLP the supported cases
8697 : are restricted to those requiring a single vector recurrence PHI. */
8698 :
8699 : bool
8700 180513 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8701 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8702 : {
8703 180513 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8704 : return false;
8705 :
8706 34679 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
8707 :
8708 : /* So far we only support first-order recurrence auto-vectorization. */
8709 34679 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8710 : return false;
8711 :
8712 416 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8713 416 : unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
8714 416 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8715 416 : unsigned dist = SLP_TREE_LANES (slp_node);
8716 : /* We need to be able to make progress with a single vector. */
8717 416 : if (maybe_gt (dist * 2, nunits))
8718 : {
8719 0 : if (dump_enabled_p ())
8720 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8721 : "first order recurrence exceeds half of "
8722 : "a vector\n");
8723 0 : return false;
8724 : }
8725 :
8726                 :   /* We need to be able to build a { ..., a, b } init vector with DIST
8727                 :      distinct trailing values.  This is always possible when dist == 1,
8728                 :      when nunits is constant, or when the initializations are
8729                 :      uniform.  */
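                     :   /* E.g. (a sketch) for dist == 2 with initial values a and b and a
                     :      four-lane vector the init vector would be { 0, 0, a, b }.  */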
8730 416 : tree uniform_initval = NULL_TREE;
8731 416 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8732 1688 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8733 : {
8734 452 : gphi *phi = as_a <gphi *> (s->stmt);
8735 452 : if (! uniform_initval)
8736 416 : uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8737 36 : else if (! operand_equal_p (uniform_initval,
8738 36 : PHI_ARG_DEF_FROM_EDGE (phi, pe)))
8739 : {
8740 : uniform_initval = NULL_TREE;
8741 : break;
8742 : }
8743 : }
8744 416 : if (!uniform_initval && !nunits.is_constant ())
8745 : {
8746 : if (dump_enabled_p ())
8747 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8748 : "cannot build initialization vector for "
8749 : "first order recurrence\n");
8750 : return false;
8751 : }
8752 :
8753 : /* First-order recurrence autovectorization needs to handle permutation
8754 : with indices = [nunits-1, nunits, nunits+1, ...]. */
8755 416 : vec_perm_builder sel (nunits, 1, 3);
8756 1664 : for (int i = 0; i < 3; ++i)
8757 1248 : sel.quick_push (nunits - dist + i);
8758 416 : vec_perm_indices indices (sel, 2, nunits);
8759 :
8760 416 : if (cost_vec) /* transformation not required. */
8761 : {
8762 373 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8763 : indices))
8764 : return false;
8765 :
8766 : /* We eventually need to set a vector type on invariant
8767 : arguments. */
8768 : unsigned j;
8769 : slp_tree child;
8770 783 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8771 522 : if (!vect_maybe_update_slp_op_vectype (child, vectype))
8772 : {
8773 0 : if (dump_enabled_p ())
8774 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8775 : "incompatible vector types for "
8776 : "invariants\n");
8777 0 : return false;
8778 : }
8779 :
8780 : /* Verify we have set up compatible types. */
8781 261 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8782 261 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
8783 261 : tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
8784 261 : if (!types_compatible_p (latch_vectype, vectype))
8785 : return false;
8786 :
8787 : /* The recurrence costs the initialization vector and one permute
8788 : for each copy. With SLP the prologue value is explicitly
8789 : represented and costed separately. */
8790 261 : unsigned prologue_cost = 0;
8791 261 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8792 : slp_node, 0, vect_body);
8793 261 : if (dump_enabled_p ())
8794 50 : dump_printf_loc (MSG_NOTE, vect_location,
8795 : "vectorizable_recurr: inside_cost = %d, "
8796 : "prologue_cost = %d .\n", inside_cost,
8797 : prologue_cost);
8798 :
8799 261 : SLP_TREE_TYPE (slp_node) = recurr_info_type;
8800 261 : return true;
8801 : }
8802 :
8803 43 : tree vec_init;
8804 43 : if (! uniform_initval)
8805 : {
8806 6 : vec<constructor_elt, va_gc> *v = NULL;
8807 6 : vec_alloc (v, nunits.to_constant ());
8808 33 : for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
8809 27 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8810 : build_zero_cst (TREE_TYPE (vectype)));
8811 39 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8812 : {
8813 21 : gphi *phi = as_a <gphi *> (s->stmt);
8814 21 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8815 21 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
8816 21 : TREE_TYPE (preheader)))
8817 : {
8818 0 : gimple_seq stmts = NULL;
8819 0 : preheader = gimple_convert (&stmts,
8820 0 : TREE_TYPE (vectype), preheader);
8821 0 : gsi_insert_seq_on_edge_immediate (pe, stmts);
8822 : }
8823 21 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
8824 : }
8825 6 : vec_init = build_constructor (vectype, v);
8826 : }
8827 : else
8828 : vec_init = uniform_initval;
8829 43 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8830 :
8831 : /* Create the vectorized first-order PHI node. */
8832 43 : tree vec_dest = vect_get_new_vect_var (vectype,
8833 : vect_simple_var, "vec_recur_");
8834 43 : basic_block bb = gimple_bb (phi);
8835 43 : gphi *new_phi = create_phi_node (vec_dest, bb);
8836 43 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8837 :
8838                 :   /* Insert the shuffles for the first-order recurrence autovectorization:
8839 : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8840 43 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
8841 :
8842 : /* Insert the required permute after the latch definition. The
8843 : second and later operands are tentative and will be updated when we have
8844 : vectorized the latch definition. */
8845 43 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8846 43 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8847 43 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8848 51 : do
8849 : {
8850 51 : gsi_next (&gsi2);
8851 : }
8852 : /* Skip inserted vectorized stmts for the latch definition. We have to
8853 : insert after those. */
8854 94 : while (gsi_stmt (gsi2) && gimple_uid (gsi_stmt (gsi2)) == 0);
8855 :
8856 123 : for (unsigned i = 0; i < ncopies; ++i)
8857 : {
8858 80 : vec_dest = make_ssa_name (vectype);
8859 80 : gassign *vperm
8860 123 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8861 43 : i == 0 ? gimple_phi_result (new_phi) : NULL,
8862 : NULL, perm);
8863 80 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8864 :
8865 80 : slp_node->push_vec_def (vperm);
8866 : }
8867 :
8868 : return true;
8869 416 : }
8870 :
8871 : /* Return true if VECTYPE represents a vector that requires lowering
8872 : by the vector lowering pass. */
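                     : /* For illustration (a sketch): on a target without a V8QI vector mode,
                     :    TYPE_MODE of an 8-byte vector type stays an integer mode such as
                     :    DImode (or BLKmode if too large), so operations on it must be
                     :    open-coded by the generic vector lowering pass.  */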
8873 :
8874 : bool
8875 804771 : vect_emulated_vector_p (tree vectype)
8876 : {
8877 1609542 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8878 808850 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8879 4061 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8880 : }
8881 :
8882 : /* Return true if we can emulate CODE on an integer mode representation
8883 : of a vector. */
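                     : /* E.g. (a sketch) a bitwise AND of two V4QI values maps directly to a
                     :    single SImode AND of the containing word, and PLUS/MINUS/NEGATE can
                     :    be emulated word-wise with bit tricks that suppress the carries
                     :    between elements; the codes below are those for which this works.  */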
8884 :
8885 : bool
8886 11714 : vect_can_vectorize_without_simd_p (tree_code code)
8887 : {
8888 11714 : switch (code)
8889 : {
8890 : case PLUS_EXPR:
8891 : case MINUS_EXPR:
8892 : case NEGATE_EXPR:
8893 : case BIT_AND_EXPR:
8894 : case BIT_IOR_EXPR:
8895 : case BIT_XOR_EXPR:
8896 : case BIT_NOT_EXPR:
8897 : return true;
8898 :
8899 11149 : default:
8900 11149 : return false;
8901 : }
8902 : }
8903 :
8904 : /* Likewise, but taking a code_helper. */
8905 :
8906 : bool
8907 992 : vect_can_vectorize_without_simd_p (code_helper code)
8908 : {
8909 992 : return (code.is_tree_code ()
8910 992 : && vect_can_vectorize_without_simd_p (tree_code (code)));
8911 : }
8912 :
8913 : /* Create vector init for vectorized iv. */
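                     : /* For example (a sketch; X = init_expr, S = step_expr, 4 lanes):
                     :      vect_step_op_neg:  [X, -X, X, -X]
                     :      vect_step_op_shr:  [X, X >> S, X >> 2*S, X >> 3*S]
                     :      vect_step_op_shl:  [X, X << S, X << 2*S, X << 3*S]
                     :      vect_step_op_mul:  [X, X*S, X*S^2, X*S^3]  */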
8914 : static tree
8915 916 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8916 : tree step_expr, poly_uint64 nunits,
8917 : tree vectype,
8918 : enum vect_induction_op_type induction_type)
8919 : {
8920 916 : unsigned HOST_WIDE_INT const_nunits;
8921 916 : tree vec_shift, vec_init, new_name;
8922 916 : unsigned i;
8923 916 : tree itype = TREE_TYPE (vectype);
8924 :
8925                 :   /* iv_loop is the loop to be vectorized.  Create the first VF values of
8926                 :      the IV per INDUCTION_TYPE (X = init_expr, S = step_expr).  */
8927 916 : new_name = gimple_convert (stmts, itype, init_expr);
8928 916 : switch (induction_type)
8929 : {
8930 18 : case vect_step_op_shr:
8931 18 : case vect_step_op_shl:
8932 : /* Build the Initial value from shift_expr. */
8933 18 : vec_init = gimple_build_vector_from_val (stmts,
8934 : vectype,
8935 : new_name);
8936 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8937 : build_zero_cst (itype), step_expr);
8938 18 : vec_init = gimple_build (stmts,
8939 : (induction_type == vect_step_op_shr
8940 : ? RSHIFT_EXPR : LSHIFT_EXPR),
8941 : vectype, vec_init, vec_shift);
8942 18 : break;
8943 :
8944 822 : case vect_step_op_neg:
8945 822 : {
8946 822 : vec_init = gimple_build_vector_from_val (stmts,
8947 : vectype,
8948 : new_name);
8949 822 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8950 : vectype, vec_init);
8951 : /* The encoding has 2 interleaved stepped patterns. */
8952 822 : vec_perm_builder sel (nunits, 2, 3);
8953 822 : sel.quick_grow (6);
8954 4110 : for (i = 0; i < 3; i++)
8955 : {
8956 2466 : sel[2 * i] = i;
8957 2466 : sel[2 * i + 1] = i + nunits;
8958 : }
8959 822 : vec_perm_indices indices (sel, 2, nunits);
8960 : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8961 : fail when vec_init is const vector. In that situation vec_perm is not
8962 : really needed. */
8963 822 : tree perm_mask_even
8964 822 : = vect_gen_perm_mask_any (vectype, indices);
8965 822 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8966 : vectype,
8967 : vec_init, vec_neg,
8968 : perm_mask_even);
8969 822 : }
8970 822 : break;
8971 :
8972 76 : case vect_step_op_mul:
8973 76 : {
8974 : /* Use unsigned mult to avoid UD integer overflow. */
8975                 : 	/* Use unsigned mult to avoid UB from signed integer overflow.  */
8976 76 : tree utype = unsigned_type_for (itype);
8977 76 : tree uvectype = build_vector_type (utype,
8978 76 : TYPE_VECTOR_SUBPARTS (vectype));
8979 76 : new_name = gimple_convert (stmts, utype, new_name);
8980 76 : vec_init = gimple_build_vector_from_val (stmts,
8981 : uvectype,
8982 : new_name);
8983 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
8984 76 : tree elt_step = build_one_cst (utype);
8985 :
8986 76 : elts.quick_push (elt_step);
8987 660 : for (i = 1; i < const_nunits; i++)
8988 : {
8989                 : 	    /* Create: elt_step = elt_step * step_expr, i.e. pow (step, i).  */
8990 508 : elt_step = gimple_build (stmts, MULT_EXPR,
8991 : utype, elt_step, step_expr);
8992 508 : elts.quick_push (elt_step);
8993 : }
8994 : /* Create a vector from [new_name_0, new_name_1, ...,
8995 : new_name_nunits-1]. */
8996 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
8997 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8998 : vec_init, vec_mul);
8999 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
9000 76 : }
9001 76 : break;
9002 :
9003 0 : default:
9004 0 : gcc_unreachable ();
9005 : }
9006 :
9007 916 : return vec_init;
9008 : }
9009 :
9010                 : /* Peel init_expr by SKIP_NITERS iterations for INDUCTION_TYPE.  */
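                     : /* E.g. (a sketch) for a mul induction with SKIP_NITERS == 3 the peeled
                     :    initial value is init_expr * pow (step_expr, 3), computed in the
                     :    unsigned type modulo 2^precision to avoid overflow UB.  */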
9011 : tree
9012 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9013 : tree skip_niters, tree step_expr,
9014 : enum vect_induction_op_type induction_type,
9015 : bool early_exit_p)
9016 : {
9017 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST || early_exit_p);
9018 84 : tree type = TREE_TYPE (init_expr);
9019 84 : unsigned prec = TYPE_PRECISION (type);
9020 84 : switch (induction_type)
9021 : {
9022                 :     /* neg inductions are typically not used for loop termination
9023                 :        conditions and are usually implemented as b = -b, i.e. every scalar
9024                 :        iteration b is negated.  That means that for the initial value of b
9025                 :        we have to determine whether the number of skipped iterations is a
9026                 :        multiple of 2, because every 2 scalar iterations we are back at "b".  */
9027 0 : case vect_step_op_neg:
9028 : /* For early exits the neg induction will always be the same value at the
9029 : start of the iteration. */
9030 0 : if (early_exit_p)
9031 : break;
9032 :
9033 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
9034 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9035 : /* else no change. */
9036 : break;
9037 :
9038 12 : case vect_step_op_shr:
9039 12 : case vect_step_op_shl:
9040 12 : skip_niters = fold_build1 (NOP_EXPR, type, skip_niters);
9041 12 : step_expr = fold_build1 (NOP_EXPR, type, step_expr);
9042 12 : step_expr = fold_build2 (MULT_EXPR, type, step_expr, skip_niters);
9043                 :       /* When the shift amount is >= the precision we need to avoid UB.
9044                 : 	 The original loop has no UB, and according to the semantics
9045                 : 	 init_expr should become 0 for lshr and ashl, and >>= (prec - 1) for ashr.  */
9046 12 : if ((!tree_fits_uhwi_p (step_expr)
9047 12 : || tree_to_uhwi (step_expr) >= prec)
9048 6 : && !early_exit_p)
9049 : {
9050 6 : if (induction_type == vect_step_op_shl
9051 6 : || TYPE_UNSIGNED (type))
9052 4 : init_expr = build_zero_cst (type);
9053 : else
9054 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9055 : init_expr,
9056 4 : wide_int_to_tree (type, prec - 1));
9057 : }
9058 : else
9059 : {
9060 8 : init_expr = fold_build2 ((induction_type == vect_step_op_shr
9061 : ? RSHIFT_EXPR : LSHIFT_EXPR),
9062 : type, init_expr, step_expr);
9063 6 : init_expr = force_gimple_operand (init_expr, stmts, false, NULL);
9064 : }
9065 : break;
9066 :
9067 72 : case vect_step_op_mul:
9068 72 : {
9069                 : 	/* Due to UB we can't support vect_step_op_mul with early break for
9070                 : 	   now, so assert and block.  */
9071 72 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9072 72 : tree utype = unsigned_type_for (type);
9073 72 : init_expr = gimple_convert (stmts, utype, init_expr);
9074 72 : wide_int skipn = wi::to_wide (skip_niters);
9075 72 : wide_int begin = wi::to_wide (step_expr);
9076 72 : auto_mpz base, exp, mod, res;
9077 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
9078 72 : wi::to_mpz (skipn, exp, UNSIGNED);
9079 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9080 72 : mpz_powm (res, base, exp, mod);
9081 72 : begin = wi::from_mpz (utype, res, true);
9082 72 : tree mult_expr = wide_int_to_tree (utype, begin);
9083 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
9084 : init_expr, mult_expr);
9085 72 : init_expr = gimple_convert (stmts, type, init_expr);
9086 72 : }
9087 72 : break;
9088 :
9089 0 : default:
9090 0 : gcc_unreachable ();
9091 : }
9092 :
9093 84 : return init_expr;
9094 : }
9095 :
9096 : /* Create vector step for vectorized iv. */
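                     : /* E.g. (a sketch, VF = 4): for vect_step_op_mul the per-vector-iteration
                     :    step is pow (step, 4); for the shifts it is step * 4; neg needs no
                     :    step at all since negating twice is the identity.  */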
9097 : static tree
9098 1202 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9099 : poly_uint64 vf,
9100 : enum vect_induction_op_type induction_type)
9101 : {
9102 1202 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9103 1202 : tree new_name = NULL;
9104 : /* Step should be pow (step, vf) for mult induction. */
9105 1202 : if (induction_type == vect_step_op_mul)
9106 : {
9107 76 : gcc_assert (vf.is_constant ());
9108 76 : wide_int begin = wi::to_wide (step_expr);
9109 :
9110 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9111 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9112 :
9113 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9114 76 : }
9115 1126 : else if (induction_type == vect_step_op_neg)
9116 : /* Do nothing. */
9117 : ;
9118 : else
9119 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9120 : expr, step_expr);
9121 1202 : return new_name;
9122 : }
9123 :
9124 : static tree
9125 1202 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9126 : stmt_vec_info stmt_info,
9127 : tree new_name, tree vectype,
9128 : enum vect_induction_op_type induction_type)
9129 : {
9130 : /* No step is needed for neg induction. */
9131 1202 : if (induction_type == vect_step_op_neg)
9132 : return NULL;
9133 :
9134 94 : tree t = unshare_expr (new_name);
9135 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9136 : || TREE_CODE (new_name) == SSA_NAME);
9137 94 : tree new_vec = build_vector_from_val (vectype, t);
9138 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9139 : new_vec, vectype, NULL);
9140 94 : return vec_step;
9141 : }
9142 :
9143                 : /* Update the vectorized iv with VEC_STEP; INDUC_DEF is the value to update.  */
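                     : /* A sketch of the per-copy update: for mul the new vector IV is
                     :    vec_def * vec_step (carried out unsigned to avoid UB), for shr/shl
                     :    it is vec_def >> vec_step resp. vec_def << vec_step, and for neg
                     :    the PHI value is reused unchanged.  */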
9144 : static tree
9145 1390 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9146 : tree induc_def, tree vec_step,
9147 : enum vect_induction_op_type induction_type)
9148 : {
9149 1390 : tree vec_def = induc_def;
9150 1390 : switch (induction_type)
9151 : {
9152 76 : case vect_step_op_mul:
9153 76 : {
9154                 : 	/* Use unsigned mult to avoid UB from signed integer overflow.  */
9155 76 : tree uvectype = unsigned_type_for (vectype);
9156 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9157 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9158 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9159 : vec_def, vec_step);
9160 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9161 : }
9162 76 : break;
9163 :
9164 12 : case vect_step_op_shr:
9165 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9166 : vec_def, vec_step);
9167 12 : break;
9168 :
9169 6 : case vect_step_op_shl:
9170 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9171 : vec_def, vec_step);
9172 6 : break;
9173 : case vect_step_op_neg:
9174 : vec_def = induc_def;
9175 : /* Do nothing. */
9176 : break;
9177 0 : default:
9178 0 : gcc_unreachable ();
9179 : }
9180 :
9181 1390 : return vec_def;
9182 :
9183 : }
9184 :
9185 : /* Function vectorizable_nonlinear_induction
9186 :
9187                 :    Check if STMT_INFO performs a nonlinear induction computation that can be
9188 : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9189 : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9190 : basic block.
9191 : Return true if STMT_INFO is vectorizable in this way. */
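                     : /* E.g. (a sketch) the nonlinear inductions handled here include
                     :      b = -b;      (vect_step_op_neg)
                     :      b = b * C;   (vect_step_op_mul)
                     :      b = b >> C;  (vect_step_op_shr)
                     :      b = b << C;  (vect_step_op_shl)
                     :    with a constant C, as opposed to the linear b = b + S case handled
                     :    by vectorizable_induction below.  */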
9192 :
9193 : static bool
9194 9204 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9195 : stmt_vec_info stmt_info,
9196 : slp_tree slp_node,
9197 : stmt_vector_for_cost *cost_vec)
9198 : {
9199 9204 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9200 9204 : unsigned ncopies;
9201 9204 : bool nested_in_vect_loop = false;
9202 9204 : class loop *iv_loop;
9203 9204 : tree vec_def;
9204 9204 : edge pe = loop_preheader_edge (loop);
9205 9204 : basic_block new_bb;
9206 9204 : tree vec_init, vec_step;
9207 9204 : tree new_name;
9208 9204 : gimple *new_stmt;
9209 9204 : gphi *induction_phi;
9210 9204 : tree induc_def, vec_dest;
9211 9204 : tree init_expr, step_expr;
9212 9204 : tree niters_skip;
9213 9204 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9214 9204 : unsigned i;
9215 9204 : gimple_stmt_iterator si;
9216 :
9217 9204 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9218 :
9219 9204 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9220 9204 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9221 9204 : enum vect_induction_op_type induction_type
9222 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9223 :
9224 9204 : gcc_assert (induction_type > vect_step_op_add);
9225 :
9226 9204 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
9227 9204 : gcc_assert (ncopies >= 1);
9228 :
9229 : /* FORNOW. Only handle nonlinear induction in the same loop. */
9230 9204 : if (nested_in_vect_loop_p (loop, stmt_info))
9231 : {
9232 0 : if (dump_enabled_p ())
9233 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9234 : "nonlinear induction in nested loop.\n");
9235 0 : return false;
9236 : }
9237 :
9238 9204 : iv_loop = loop;
9239 9204 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9240 :
9241                 :   /* TODO: Support multi-lane SLP for nonlinear ivs.  There should be a
9242                 :      separate vector iv update for each iv and a permutation to generate
9243                 :      the wanted vector iv.  */
9244 9204 : if (SLP_TREE_LANES (slp_node) > 1)
9245 : {
9246 0 : if (dump_enabled_p ())
9247 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9248 : "SLP induction not supported for nonlinear"
9249 : " induction.\n");
9250 0 : return false;
9251 : }
9252 :
9253 9204 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9254 : {
9255 0 : if (dump_enabled_p ())
9256 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9257 : "floating point nonlinear induction vectorization"
9258 : " not supported.\n");
9259 0 : return false;
9260 : }
9261 :
9262 9204 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9263 9204 : init_expr = vect_phi_initial_value (phi);
9264 9204 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9265 : && TREE_CODE (step_expr) == INTEGER_CST);
9266                 :   /* step_expr should be aligned with init_expr, i.e. for uint64 a >> 1
9267                 :      the step is an int but a vector<uint64> shift is used.  */
9268 9204 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9269 :
9270 9204 : if (TREE_CODE (init_expr) == INTEGER_CST)
9271 4097 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9272 5107 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9273 : {
9274 : /* INIT_EXPR could be a bit_field, bail out for such case. */
9275 4 : if (dump_enabled_p ())
9276 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9277 : "nonlinear induction vectorization failed:"
9278 : " component type of vectype is not a nop conversion"
9279 : " from type of init_expr.\n");
9280 4 : return false;
9281 : }
9282 :
9283 9200 : switch (induction_type)
9284 : {
9285 3718 : case vect_step_op_neg:
9286 3718 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9287 : return false;
9288 3556 : if (TREE_CODE (init_expr) != INTEGER_CST
9289 282 : && TREE_CODE (init_expr) != REAL_CST)
9290 : {
9291 : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9292 282 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9293 0 : return false;
9294 :
9295 : /* The encoding has 2 interleaved stepped patterns. */
9296 282 : vec_perm_builder sel (nunits, 2, 3);
9297 282 : machine_mode mode = TYPE_MODE (vectype);
9298 282 : sel.quick_grow (6);
9299 1410 : for (i = 0; i < 3; i++)
9300 : {
9301 846 : sel[i * 2] = i;
9302 846 : sel[i * 2 + 1] = i + nunits;
9303 : }
9304 282 : vec_perm_indices indices (sel, 2, nunits);
9305 282 : if (!can_vec_perm_const_p (mode, mode, indices))
9306 0 : return false;
9307 282 : }
9308 : break;
9309 :
9310 1066 : case vect_step_op_mul:
9311 1066 : {
9312 : /* Check for backend support of MULT_EXPR. */
9313 1066 : if (!directly_supported_p (MULT_EXPR, vectype))
9314 : return false;
9315 :
9316                 : 	/* ??? How to construct the vector step for a variable-length vector:
9317                 : 	   [ 1, step, pow (step, 2), pow (step, 3), ... ].  */
9318 : if (!vf.is_constant ())
9319 : return false;
9320 : }
9321 : break;
9322 :
9323 4098 : case vect_step_op_shr:
9324 : /* Check for backend support of RSHIFT_EXPR. */
9325 4098 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9326 : return false;
9327 :
9328                 :       /* Don't shift more than the type precision to avoid UB.  */
9329 26 : if (!tree_fits_uhwi_p (step_expr)
9330 26 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9331 : TYPE_PRECISION (TREE_TYPE (init_expr))))
9332 : return false;
9333 : break;
9334 :
9335 318 : case vect_step_op_shl:
9336                 :       /* Check for backend support of LSHIFT_EXPR.  */
9337 318 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9338 : return false;
9339 :
9340                 :       /* Don't shift more than the type precision to avoid UB.  */
9341 12 : if (!tree_fits_uhwi_p (step_expr)
9342 12 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9343 : TYPE_PRECISION (TREE_TYPE (init_expr))))
9344 : return false;
9345 :
9346 : break;
9347 :
9348 0 : default:
9349 0 : gcc_unreachable ();
9350 : }
9351 :
9352 4420 : if (cost_vec) /* transformation not required. */
9353 : {
9354 3504 : unsigned inside_cost = 0, prologue_cost = 0;
9355                 :       /* loop cost for vec_loop.  */
9357 3504 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9358 : slp_node, 0, vect_body);
9359 :
9360                 :       /* Neg induction doesn't have any inside_cost.  */
9362 3504 : if (induction_type == vect_step_op_neg)
9363 2734 : inside_cost = 0;
9364 :
9365 : /* prologue cost for vec_init and vec_step. */
9366 3504 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9367 : slp_node, 0, vect_prologue);
9368 :
9369 3504 : if (dump_enabled_p ())
9370 68 : dump_printf_loc (MSG_NOTE, vect_location,
9371 : "vect_model_induction_cost: inside_cost = %d, "
9372 : "prologue_cost = %d. \n", inside_cost,
9373 : prologue_cost);
9374 :
9375 3504 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9376 3504 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9377 3504 : return true;
9378 : }
9379 :
9380 : /* Transform. */
9381 :
9382 : /* Compute a vector variable, initialized with the first VF values of
9383 : the induction variable. E.g., for an iv with IV_PHI='X' and
9384 : evolution S, for a vector of 4 units, we want to compute:
9385 : [X, X + S, X + 2*S, X + 3*S]. */
9386 :
9387 916 : if (dump_enabled_p ())
9388 32 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9389 :
9390 916 : pe = loop_preheader_edge (iv_loop);
9391 : /* Find the first insertion point in the BB. */
9392 916 : basic_block bb = gimple_bb (phi);
9393 916 : si = gsi_after_labels (bb);
9394 :
9395 916 : gimple_seq stmts = NULL;
9396 :
9397 916 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9398 : /* If we are using the loop mask to "peel" for alignment then we need
9399 : to adjust the start value here. */
9400 916 : if (niters_skip != NULL_TREE)
9401 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9402 : step_expr, induction_type, false);
9403 :
9404 916 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9405 : step_expr, nunits, vectype,
9406 : induction_type);
9407 916 : if (stmts)
9408 : {
9409 162 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9410 162 : gcc_assert (!new_bb);
9411 : }
9412 :
9413 916 : stmts = NULL;
9414 916 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9415 : vf, induction_type);
9416 916 : if (stmts)
9417 : {
9418 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9419 0 : gcc_assert (!new_bb);
9420 : }
9421 :
9422 916 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9423 : new_name, vectype,
9424 : induction_type);
9425 : /* Create the following def-use cycle:
9426 : loop prolog:
9427 : vec_init = ...
9428 : vec_step = ...
9429 : loop:
9430 : vec_iv = PHI <vec_init, vec_loop>
9431 : ...
9432 : STMT
9433 : ...
9434 : vec_loop = vec_iv + vec_step; */
9435 :
9436 : /* Create the induction-phi that defines the induction-operand. */
9437 916 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9438 916 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9439 916 : induc_def = PHI_RESULT (induction_phi);
9440 :
9441 : /* Create the iv update inside the loop. */
9442 916 : stmts = NULL;
9443 916 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9444 : induc_def, vec_step,
9445 : induction_type);
9446 :
9447 916 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9448 916 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9449 :
9450 : /* Set the arguments of the phi node: */
9451 916 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9452 916 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9453 : UNKNOWN_LOCATION);
9454 :
9455 916 : slp_node->push_vec_def (induction_phi);
9456 :
9457                 :   /* In case the vectorization factor (VF) is bigger than the number
9458                 :      of elements that we can fit in a vectype (nunits), we have to
9459                 :      generate more than one vector stmt, i.e. we need to "unroll" the
9460 : vector stmt by a factor VF/nunits. For more details see documentation
9461 : in vectorizable_operation. */
9462 :
9463 916 : if (ncopies > 1)
9464 : {
9465 286 : stmts = NULL;
9466 : /* FORNOW. This restriction should be relaxed. */
9467 286 : gcc_assert (!nested_in_vect_loop);
9468 :
9469 286 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9470 : nunits, induction_type);
9471 :
9472 286 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9473 : new_name, vectype,
9474 : induction_type);
9475 286 : vec_def = induc_def;
9476 1046 : for (i = 1; i < ncopies; i++)
9477 : {
9478 : /* vec_i = vec_prev + vec_step. */
9479 474 : stmts = NULL;
9480 474 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9481 : vec_def, vec_step,
9482 : induction_type);
9483 474 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9484 474 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9485 474 : slp_node->push_vec_def (new_stmt);
9486 : }
9487 : }
9488 :
9489 916 : if (dump_enabled_p ())
9490 64 : dump_printf_loc (MSG_NOTE, vect_location,
9491 : "transform induction: created def-use cycle: %G%G",
9492 32 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9493 :
9494 : return true;
9495 : }
9496 :
9497 : /* Function vectorizable_induction
9498 :
9499 : Check if STMT_INFO performs an induction computation that can be vectorized.
9500 : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9501 : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9502 : Return true if STMT_INFO is vectorizable in this way. */
9503 :
9504 : bool
9505 314445 : vectorizable_induction (loop_vec_info loop_vinfo,
9506 : stmt_vec_info stmt_info,
9507 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9508 : {
9509 314445 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9510 314445 : bool nested_in_vect_loop = false;
9511 314445 : class loop *iv_loop;
9512 314445 : tree vec_def;
9513 314445 : edge pe = loop_preheader_edge (loop);
9514 314445 : basic_block new_bb;
9515 314445 : tree vec_init = NULL_TREE, vec_step, t;
9516 314445 : tree new_name;
9517 314445 : gphi *induction_phi;
9518 314445 : tree induc_def, vec_dest;
9519 314445 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9520 314445 : unsigned i;
9521 314445 : tree expr;
9522 314445 : tree index_vectype = NULL_TREE;
9523 314445 : gimple_stmt_iterator si;
9524 314445 : enum vect_induction_op_type induction_type
9525 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9526 :
9527 345118 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9528 168611 : if (!phi)
9529 : return false;
9530 :
9531 168611 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9532 : return false;
9533 :
9534 : /* Make sure it was recognized as induction computation. */
9535 168611 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9536 : return false;
9537 :
9538 : /* Handle nonlinear induction in a separate place. */
9539 164620 : if (induction_type != vect_step_op_add)
9540 9204 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9541 9204 : slp_node, cost_vec);
9542 :
9543 155416 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9544 155416 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9545 :
9546 : /* FORNOW. These restrictions should be relaxed. */
9547 155416 : if (nested_in_vect_loop_p (loop, stmt_info))
9548 : {
9549 813 : imm_use_iterator imm_iter;
9550 813 : use_operand_p use_p;
9551 813 : gimple *exit_phi;
9552 813 : edge latch_e;
9553 813 : tree loop_arg;
9554 :
9555 813 : exit_phi = NULL;
9556 813 : latch_e = loop_latch_edge (loop->inner);
9557 813 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9558 2475 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9559 : {
9560 873 : gimple *use_stmt = USE_STMT (use_p);
9561 873 : if (is_gimple_debug (use_stmt))
9562 36 : continue;
9563 :
9564 837 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9565 : {
9566 : exit_phi = use_stmt;
9567 : break;
9568 : }
9569 813 : }
9570 813 : if (exit_phi)
9571 : {
9572 24 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9573 24 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9574 8 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9575 : {
9576 16 : if (dump_enabled_p ())
9577 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9578 : "inner-loop induction only used outside "
9579 : "of the outer vectorized loop.\n");
9580 16 : return false;
9581 : }
9582 : }
9583 :
9584 797 : nested_in_vect_loop = true;
9585 797 : iv_loop = loop->inner;
9586 : }
9587 : else
9588 : iv_loop = loop;
9589 155400 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9590 :
9591 155400 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
9592 : {
9593 : /* The current SLP code creates the step value element-by-element. */
9594 : if (dump_enabled_p ())
9595 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9596 : "SLP induction not supported for variable-length"
9597 : " vectors.\n");
9598 : return false;
9599 : }
9600 :
9601 155400 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9602 : {
9603 12 : if (dump_enabled_p ())
9604 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9605 : "floating point induction vectorization disabled\n");
9606 12 : return false;
9607 : }
9608 :
9609 155388 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9610 155388 : gcc_assert (step_expr != NULL_TREE);
9611 310752 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
9612 310653 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
9613 : {
9614 12 : if (dump_enabled_p ())
9615 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9616 : "bit-precision induction vectorization not "
9617 : "supported.\n");
9618 12 : return false;
9619 : }
9620 155376 : tree stept = TREE_TYPE (step_expr);
9621 155376 : tree step_vectype = get_same_sized_vectype (stept, vectype);
9622 155376 : stept = TREE_TYPE (step_vectype);
9623 :
9624 : /* Check for target support of the vectorized arithmetic used here. */
9625 155376 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
9626 155376 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
9627 26642 : return false;
9628 128734 : if (!nunits.is_constant ())
9629 : {
9630 : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
9631 : return false;
9632 : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
9633 : if (SCALAR_FLOAT_TYPE_P (stept))
9634 : {
9635 : tree index_type = build_nonstandard_integer_type
9636 : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
9637 :
9638 : index_vectype = build_vector_type (index_type, nunits);
9639 : if (!can_float_p (TYPE_MODE (step_vectype),
9640 : TYPE_MODE (index_vectype), 1))
9641 : return false;
9642 : }
9643 : }
9644 :
9645 128734 : unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
9646 128734 : if (cost_vec) /* transformation not required. */
9647 : {
9648 340365 : unsigned inside_cost = 0, prologue_cost = 0;
9649 : /* We eventually need to set a vector type on invariant
9650 : arguments. */
9651 : unsigned j;
9652 : slp_tree child;
9653 340365 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9654 226910 : if (!vect_maybe_update_slp_op_vectype
9655 226910 : (child, SLP_TREE_VECTYPE (slp_node)))
9656 : {
9657 0 : if (dump_enabled_p ())
9658 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9659 : "incompatible vector types for "
9660 : "invariants\n");
9661 0 : return false;
9662 : }
9663 : /* loop cost for vec_loop. */
9664 113455 : inside_cost = record_stmt_cost (cost_vec, nvects,
9665 : vector_stmt, slp_node, 0, vect_body);
9666 : /* prologue cost for vec_init (if not nested) and step. */
9667 113455 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9668 : scalar_to_vec,
9669 : slp_node, 0, vect_prologue);
9670 113455 : if (dump_enabled_p ())
9671 4068 : dump_printf_loc (MSG_NOTE, vect_location,
9672 : "vect_model_induction_cost: inside_cost = %d, "
9673 : "prologue_cost = %d .\n", inside_cost,
9674 : prologue_cost);
9675 :
9676 113455 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9677 113455 : DUMP_VECT_SCOPE ("vectorizable_induction");
9678 113455 : return true;
9679 : }
9680 :
9681 : /* Transform. */
9682 :
9683 : /* Compute a vector variable, initialized with the first VF values of
9684 : the induction variable. E.g., for an iv with IV_PHI='X' and
9685 : evolution S, for a vector of 4 units, we want to compute:
9686 : [X, X + S, X + 2*S, X + 3*S]. */
9687 :
9688 15279 : if (dump_enabled_p ())
9689 2771 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9690 :
9691 15279 : pe = loop_preheader_edge (iv_loop);
9692 : /* Find the first insertion point in the BB. */
9693 15279 : basic_block bb = gimple_bb (phi);
9694 15279 : si = gsi_after_labels (bb);
9695 :
9696 : /* For SLP induction we have to generate several IVs as for example
9697 : with group size 3 we need
9698 : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9699 : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9700 15279 : gimple_stmt_iterator incr_si;
9701 15279 : bool insert_after;
9702 15279 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
9703 :
9704 : /* The initial values are vectorized, but any lanes > group_size
9705 : need adjustment. */
9706 15279 : slp_tree init_node
9707 15279 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9708 :
9709 : /* Gather steps. Since we do not vectorize inductions as
9710 : cycles we have to reconstruct the step from SCEV data. */
9711 15279 : unsigned group_size = SLP_TREE_LANES (slp_node);
9712 15279 : tree *steps = XALLOCAVEC (tree, group_size);
9713 15279 : tree *inits = XALLOCAVEC (tree, group_size);
9714 15279 : stmt_vec_info phi_info;
9715 47065 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9716 : {
9717 16507 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9718 16507 : if (!init_node)
9719 16262 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9720 : pe->dest_idx);
9721 : }
9722 :
9723 : /* Now generate the IVs. */
9724 30558 : gcc_assert (multiple_p (nunits * nvects, group_size));
9725 15279 : unsigned nivs;
9726 15279 : unsigned HOST_WIDE_INT const_nunits;
9727 15279 : if (nested_in_vect_loop)
9728 : nivs = nvects;
9729 15055 : else if (nunits.is_constant (&const_nunits))
9730 : {
9731 : /* Compute the number of distinct IVs we need. First reduce
9732 : group_size if it is a multiple of const_nunits so we get
9733 : one IV for a group_size of 4 but const_nunits 2. */
9734 15055 : unsigned group_sizep = group_size;
9735 15055 : if (group_sizep % const_nunits == 0)
9736 111 : group_sizep = group_sizep / const_nunits;
9737 15055 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
9738 : }
9739 : else
9740 : {
9741 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9742 : nivs = 1;
9743 : }
9744 15279 : gimple_seq init_stmts = NULL;
9745 15279 : tree lupdate_mul = NULL_TREE;
9746 224 : if (!nested_in_vect_loop)
9747 : {
9748 15055 : if (nunits.is_constant (&const_nunits))
9749 : {
9750 : /* The number of iterations covered in one vector iteration. */
9751 15055 : unsigned lup_mul = (nvects * const_nunits) / group_size;
9752 15055 : lupdate_mul
9753 15055 : = build_vector_from_val (step_vectype,
9754 15055 : SCALAR_FLOAT_TYPE_P (stept)
9755 28 : ? build_real_from_wide (stept, lup_mul,
9756 : UNSIGNED)
9757 30082 : : build_int_cstu (stept, lup_mul));
9758 : }
9759 : else
9760 : {
9761 : if (SCALAR_FLOAT_TYPE_P (stept))
9762 : {
9763 : tree tem = build_int_cst (integer_type_node, vf);
9764 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9765 : }
9766 : else
9767 : lupdate_mul = build_int_cst (stept, vf);
9768 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9769 : lupdate_mul);
9770 : }
9771 : }
9772 15279 : tree peel_mul = NULL_TREE;
9773 15279 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9774 : {
9775 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9776 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9777 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9778 : else
9779 0 : peel_mul = gimple_convert (&init_stmts, stept,
9780 : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9781 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
9782 : step_vectype, peel_mul);
9783 : }
9784 15279 : tree step_mul = NULL_TREE;
9785 15279 : unsigned ivn;
9786 15279 : auto_vec<tree> vec_steps;
9787 31130 : for (ivn = 0; ivn < nivs; ++ivn)
9788 : {
9789 15851 : gimple_seq stmts = NULL;
9790 15851 : bool invariant = true;
9791 15851 : if (nunits.is_constant (&const_nunits))
9792 : {
9793 15851 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9794 15851 : tree_vector_builder init_elts (vectype, const_nunits, 1);
9795 15851 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9796 102295 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9797 : {
9798 : /* The scalar steps of the IVs. */
9799 86444 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9800 86444 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9801 86444 : step_elts.quick_push (elt);
9802 86444 : if (!init_node)
9803 : {
9804 : /* The scalar inits of the IVs if not vectorized. */
9805 85182 : elt = inits[(ivn*const_nunits + eltn) % group_size];
9806 85182 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
9807 85182 : TREE_TYPE (elt)))
9808 264 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9809 264 : TREE_TYPE (vectype), elt);
9810 85182 : init_elts.quick_push (elt);
9811 : }
9812 : /* The number of steps to add to the initial values. */
9813 86444 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9814 172888 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9815 172786 : ? build_real_from_wide (stept, mul_elt,
9816 : UNSIGNED)
9817 172786 : : build_int_cstu (stept, mul_elt));
9818 : }
9819 15851 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
9820 15851 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9821 15851 : if (!init_node)
9822 15593 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
9823 15851 : }
9824 : else
9825 : {
9826 : tree step = gimple_convert (&init_stmts, stept, steps[0]);
9827 : if (init_node)
9828 : ;
9829 : else if (INTEGRAL_TYPE_P (stept))
9830 : {
9831 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9832 : /* Build the initial value directly as a VEC_SERIES_EXPR. */
9833 : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
9834 : step_vectype, new_name, step);
9835 : if (!useless_type_conversion_p (vectype, step_vectype))
9836 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9837 : vectype, vec_init);
9838 : }
9839 : else
9840 : {
9841 : /* Build:
9842 : [base, base, base, ...]
9843 : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9844 : gcc_assert (SCALAR_FLOAT_TYPE_P (stept));
9845 : gcc_assert (flag_associative_math);
9846 : gcc_assert (index_vectype != NULL_TREE);
9847 :
9848 : tree index = build_index_vector (index_vectype, 0, 1);
9849 : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9850 : tree base_vec = gimple_build_vector_from_val (&init_stmts,
9851 : step_vectype,
9852 : new_name);
9853 : tree step_vec = gimple_build_vector_from_val (&init_stmts,
9854 : step_vectype,
9855 : step);
9856 : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
9857 : step_vectype, index);
9858 : vec_init = gimple_build (&init_stmts, MULT_EXPR,
9859 : step_vectype, vec_init, step_vec);
9860 : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
9861 : step_vectype, vec_init, base_vec);
9862 : if (!useless_type_conversion_p (vectype, step_vectype))
9863 : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9864 : vectype, vec_init);
9865 : }
9866 : /* iv_loop is nested in the loop to be vectorized. Generate:
9867 : vec_step = [S, S, S, S] */
9868 : t = unshare_expr (step);
9869 : gcc_assert (CONSTANT_CLASS_P (t)
9870 : || TREE_CODE (t) == SSA_NAME);
9871 : vec_step = gimple_build_vector_from_val (&init_stmts,
9872 : step_vectype, t);
9873 : }
9874 15851 : vec_steps.safe_push (vec_step);
9875 15851 : if (peel_mul)
9876 : {
9877 0 : if (!step_mul)
9878 : {
9879 0 : gcc_assert (!nunits.is_constant ());
9880 : step_mul = gimple_build (&init_stmts,
9881 : MINUS_EXPR, step_vectype,
9882 : build_zero_cst (step_vectype), peel_mul);
9883 : }
9884 : else
9885 0 : step_mul = gimple_build (&init_stmts,
9886 : MINUS_EXPR, step_vectype,
9887 : step_mul, peel_mul);
9888 : }
9889 :
9890 : /* Create the induction-phi that defines the induction-operand. */
9891 15851 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9892 : "vec_iv_");
9893 15851 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9894 15851 : induc_def = PHI_RESULT (induction_phi);
9895 :
9896 : /* Create the iv update inside the loop */
9897 15851 : tree up = vec_step;
9898 15851 : if (lupdate_mul)
9899 : {
9900 15593 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
9901 : {
9902                 : 	      /* When we're using the loop_len produced by SELECT_VL, the
9903                 : 		 non-final iterations do not always process VF
9904                 : 		 elements.  So instead of updating the induction variable as
9905 :
9906 : _21 = vect_vec_iv_.6_22 + { VF, ... };
9907 :
9908                 : 		 we should generate:
9909 :
9910 : _35 = .SELECT_VL (ivtmp_33, VF);
9911 : vect_cst__22 = [vec_duplicate_expr] _35;
9912 : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
9913 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
9914 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
9915 : vectype, 0, 0, false);
9916 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9917 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
9918 : else
9919 0 : expr = gimple_convert (&stmts, stept, len);
9920 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
9921 : expr);
9922 0 : up = gimple_build (&stmts, MULT_EXPR,
9923 : step_vectype, vec_step, lupdate_mul);
9924 : }
9925 : else
9926 15593 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9927 : vec_step, lupdate_mul);
9928 : }
9929 15851 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9930 15851 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
9931 15851 : vec_def = gimple_convert (&stmts, vectype, vec_def);
9932 15851 : insert_iv_increment (&incr_si, insert_after, stmts);
9933 15851 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9934 : UNKNOWN_LOCATION);
9935 :
9936 15851 : if (init_node)
9937 258 : vec_init = vect_get_slp_vect_def (init_node, ivn);
9938 15851 : if (!nested_in_vect_loop
9939 15851 : && step_mul
9940 15851 : && !integer_zerop (step_mul))
9941 : {
9942 15152 : gcc_assert (invariant);
9943 15152 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9944 15152 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9945 : vec_step, step_mul);
9946 15152 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9947 : vec_def, up);
9948 15152 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9949 : }
9950 :
9951 : /* Set the arguments of the phi node: */
9952 15851 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9953 :
9954 15851 : slp_node->push_vec_def (induction_phi);
9955 : }
9956 15279 : if (!nested_in_vect_loop)
9957 : {
9958 : /* Fill up to the number of vectors we need for the whole group. */
9959 15055 : if (nunits.is_constant (&const_nunits))
9960 15055 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
9961 : else
9962 : nivs = 1;
9963 15055 : vec_steps.reserve (nivs-ivn);
9964 30131 : for (; ivn < nivs; ++ivn)
9965 : {
9966 21 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9967 21 : vec_steps.quick_push (vec_steps[0]);
9968 : }
9969 : }
9970 :
9971 : /* Re-use IVs when we can. We are generating further vector
9972 : stmts by adding VF' * stride to the IVs generated above. */
9973 15279 : if (ivn < nvects)
9974 : {
9975 3394 : if (nunits.is_constant (&const_nunits))
9976 : {
9977 3394 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
9978 3394 : / group_size);
9979 3394 : lupdate_mul
9980 3394 : = build_vector_from_val (step_vectype,
9981 3394 : SCALAR_FLOAT_TYPE_P (stept)
9982 8 : ? build_real_from_wide (stept,
9983 8 : vfp, UNSIGNED)
9984 6780 : : build_int_cstu (stept, vfp));
9985 : }
9986 : else
9987 : {
9988 : if (SCALAR_FLOAT_TYPE_P (stept))
9989 : {
9990 : tree tem = build_int_cst (integer_type_node, nunits);
9991 : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9992 : }
9993 : else
9994 : lupdate_mul = build_int_cst (stept, nunits);
9995 : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9996 : lupdate_mul);
9997 : }
9998 10904 : for (; ivn < nvects; ++ivn)
9999 : {
10000 7510 : gimple *iv
10001 7510 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10002 7510 : tree def = gimple_get_lhs (iv);
10003 7510 : if (ivn < 2*nivs)
10004 3482 : vec_steps[ivn - nivs]
10005 3482 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10006 3482 : vec_steps[ivn - nivs], lupdate_mul);
10007 7510 : gimple_seq stmts = NULL;
10008 7510 : def = gimple_convert (&stmts, step_vectype, def);
10009 22530 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10010 7510 : def, vec_steps[ivn % nivs]);
10011 7510 : def = gimple_convert (&stmts, vectype, def);
10012 7510 : if (gimple_code (iv) == GIMPLE_PHI)
10013 3482 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10014 : else
10015 : {
10016 4028 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10017 4028 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10018 : }
10019 7510 : slp_node->push_vec_def (def);
10020 : }
10021 : }
10022 :
10023 15279 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10024 15279 : gcc_assert (!new_bb);
10025 :
10026 15279 : return true;
10027 15279 : }
10028 :
10029 : /* Function vectorizable_live_operation_1.
10030 :
10031 : Helper for vectorizable_live_operation: extract the live lane from the vectorized value and convert it to the scalar type. */
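      :
      : /* For example (an illustrative sketch; the lane number is assumed):
      :    in the plain (unmasked) case, extracting lane 3 of a V4SI
      :    live-out value in the exit block becomes
      :
      :      # vec_lhs' = PHI <vec_lhs>
      :      scalar_res = BIT_FIELD_REF <vec_lhs', 32, 96>;
      :
      :    i.e. BITSIZE is 32 and BITSTART is 3 * 32 = 96.  */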
10032 :
10033 : static tree
10034 2844 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
10035 : tree vectype, slp_tree slp_node,
10036 : tree bitsize, tree bitstart, tree vec_lhs,
10037 : tree lhs_type, gimple_stmt_iterator *exit_gsi)
10038 : {
10039 2844 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10040 :
10041 2844 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10042 2844 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10043 5690 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10044 2846 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
10045 :
10046 2844 : gimple_seq stmts = NULL;
10047 2844 : tree new_tree;
10048 :
10049 : /* If bitstart is 0 then we can use a BIT_FIELD_REF. */
10050 2844 : if (integer_zerop (bitstart))
10051 : {
10052 213 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10053 : vec_lhs_phi, bitsize, bitstart);
10054 :
10055 : /* Convert the extracted vector element to the scalar type. */
10056 213 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10057 : }
10058 2631 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10059 : {
10060 : /* Emit:
10061 :
10062 : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>
10063 :
10064 : where VEC_LHS is the vectorized live-out result, LEN is the length of
10065 : the vector, BIAS is the load-store bias. The bias should not be used
10066 : at all since we are not using load/store operations, but LEN will be
10067 : REALLEN + BIAS, so subtract it to get to the correct position. */
10068 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10069 0 : gimple_seq tem = NULL;
10070 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10071 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
10072 : &LOOP_VINFO_LENS (loop_vinfo),
10073 : 1, vectype, 0, 1, false);
10074 0 : gimple_seq_add_seq (&stmts, tem);
10075 :
10076 : /* LAST_INDEX = LEN - 1. */
10077 0 : tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
10078 0 : len, build_one_cst (TREE_TYPE (len)));
10079 :
10080 : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - 1>. */
10081 0 : tree scalar_res
10082 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10083 : vec_lhs_phi, last_index);
10084 :
10085 : /* Convert the extracted vector element to the scalar type. */
10086 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10087 : }
10088 2631 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10089 : {
10090 : /* Emit:
10091 :
10092 : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10093 :
10094 : where VEC_LHS is the vectorized live-out result and MASK is
10095 : the loop mask for the final iteration. */
10096 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10097 0 : tree scalar_type = TREE_TYPE (vectype);
10098 0 : gimple_seq tem = NULL;
10099 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10100 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10101 : &LOOP_VINFO_MASKS (loop_vinfo),
10102 : 1, vectype, 0);
10103 0 : tree scalar_res;
10104 0 : gimple_seq_add_seq (&stmts, tem);
10105 :
10106 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10107 : mask, vec_lhs_phi);
10108 :
10109 : /* Convert the extracted vector element to the scalar type. */
10110 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10111 : }
10112 : else
10113 : {
10114 2631 : tree bftype = TREE_TYPE (vectype);
10115 2631 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10116 85 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10117 2631 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10118 2631 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10119 : &stmts, true, NULL_TREE);
10120 : }
10121 :
10122 2844 : *exit_gsi = gsi_after_labels (exit_bb);
10123 2844 : if (stmts)
10124 2844 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10125 :
10126 2844 : return new_tree;
10127 : }
10128 :
10129 : /* Function vectorizable_live_operation.
10130 :
10131 : STMT_INFO computes a value that is used outside the loop. Check if
10132 : it can be supported. */
10133 :
10134 : bool
10135 259740 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10136 : slp_tree slp_node, slp_instance slp_node_instance,
10137 : int slp_index, bool vec_stmt_p,
10138 : stmt_vector_for_cost *cost_vec)
10139 : {
10140 259740 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10141 259740 : imm_use_iterator imm_iter;
10142 259740 : tree lhs, lhs_type, bitsize;
10143 259740 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10144 259740 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10145 259740 : gimple *use_stmt;
10146 259740 : use_operand_p use_p;
10147 259740 : auto_vec<tree> vec_oprnds;
10148 259740 : int vec_entry = 0;
10149 259740 : poly_uint64 vec_index = 0;
10150 :
10151 259740 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10152 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10153 :
10154 : /* If a stmt of a reduction is live, vectorize it via
10155 : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10156 : validity so just trigger the transform here. */
10157 259740 : if (vect_is_reduction (slp_node))
10158 : {
10159 86610 : if (!vec_stmt_p)
10160 : {
10161 63395 : SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
10162 63395 : return true;
10163 : }
10164 : /* For SLP reductions we vectorize the epilogue for all involved stmts
10165 : together. For SLP reduction chains we only get here once. */
10166 23215 : if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
10167 22959 : && slp_index != 0)
10168 : return true;
10169 22767 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
10170 22767 : if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10171 22767 : || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10172 : return true;
10173 :
10174 21928 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10175 21928 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10176 21919 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10177 : slp_node_instance,
10178 : LOOP_VINFO_MAIN_EXIT (loop_vinfo));
10179 :
10180 : /* With an early break we only have to materialize the reduction on the merge
10181 : block, but we have to find an alternate exit first. */
10182 21928 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10183 : {
10184 28 : slp_tree phis_node = slp_node_instance->reduc_phis;
10185 28 : stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
10186 89 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10187 28 : if (exit != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
10188 : {
10189 23 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10190 : phis_node, slp_node_instance,
10191 : exit);
10192 23 : break;
10193 28 : }
10194 28 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10195 9 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10196 : phis_node, slp_node_instance,
10197 : LOOP_VINFO_MAIN_EXIT
10198 : (loop_vinfo));
10199 : }
10200 :
10201 21928 : return true;
10202 : }
10203 :
10204 : /* If STMT is not relevant and it is a simple assignment and its inputs are
10205 : invariant then it can remain in place, unvectorized. The original last
10206 : scalar value that it computes will be used. */
10207 173130 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10208 : {
10209 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10210 0 : if (dump_enabled_p ())
10211 0 : dump_printf_loc (MSG_NOTE, vect_location,
10212 : "statement is simple and uses invariant. Leaving in "
10213 : "place.\n");
10214 0 : return true;
10215 : }
10216 :
10217 173130 : gcc_assert (slp_index >= 0);
10218 :
10219 : /* Get the last occurrence of the scalar index from the concatenation of
10220 : all the slp vectors. Calculate which slp vector it is and the index
10221 : within. */
10222 173130 : int num_scalar = SLP_TREE_LANES (slp_node);
10223 173130 : int num_vec = vect_get_num_copies (vinfo, slp_node);
10224 173130 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10225 :
10226 : /* Calculate which vector contains the result, and which lane of
10227 : that vector we need. */
10228 173130 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10229 : {
10230 : if (dump_enabled_p ())
10231 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10232 : "Cannot determine which vector holds the"
10233 : " final result.\n");
10234 : return false;
10235 : }
10236 :
10237 173130 : if (!vec_stmt_p)
10238 : {
10239 : /* No transformation required. */
10240 135316 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10241 : {
10242 27331 : if (SLP_TREE_LANES (slp_node) != 1)
10243 : {
10244 19 : if (dump_enabled_p ())
10245 19 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10246 : "can't operate on partial vectors "
10247 : "because an SLP statement is live after "
10248 : "the loop.\n");
10249 19 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10250 : }
10251 27312 : else if (num_vec > 1)
10252 : {
10253 15601 : if (dump_enabled_p ())
10254 51 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10255 : "can't operate on partial vectors "
10256 : "because ncopies is greater than 1.\n");
10257 15601 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10258 : }
10259 : else
10260 : {
10261 11711 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10262 : OPTIMIZE_FOR_SPEED))
10263 0 : vect_record_loop_mask (loop_vinfo,
10264 : &LOOP_VINFO_MASKS (loop_vinfo),
10265 : 1, vectype, NULL);
10266 11711 : else if (can_vec_extract_var_idx_p (
10267 11711 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10268 0 : vect_record_loop_len (loop_vinfo,
10269 : &LOOP_VINFO_LENS (loop_vinfo),
10270 : 1, vectype, 1);
10271 : else
10272 : {
10273 11711 : if (dump_enabled_p ())
10274 651 : dump_printf_loc (
10275 651 : MSG_MISSED_OPTIMIZATION, vect_location,
10276 : "can't operate on partial vectors "
10277 : "because the target doesn't support extract "
10278 : "last reduction.\n");
10279 11711 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10280 : }
10281 : }
10282 : }
10283 : /* ??? Enable for loop costing as well. */
10284 27331 : if (!loop_vinfo)
10285 64415 : record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
10286 : 0, vect_epilogue);
10287 135316 : SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
10288 135316 : return true;
10289 : }
10290 :
10291 : /* Use the lhs of the original scalar statement. */
10292 37814 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10293 37814 : if (dump_enabled_p ())
10294 980 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10295 : "stmt %G", stmt);
10296 :
10297 37814 : lhs = gimple_get_lhs (stmt);
10298 37814 : lhs_type = TREE_TYPE (lhs);
10299 :
10300 37814 : bitsize = vector_element_bits_tree (vectype);
10301 :
10302 : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10303 37814 : gcc_assert (!loop_vinfo
10304 : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10305 : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10306 : || SLP_TREE_LANES (slp_node) == 1));
10307 :
10308 : /* Get the correct slp vectorized stmt. */
10309 37814 : tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10310 37814 : gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10311 :
10312 : /* In case of early-break vectorization also get the first stmt. */
10313 37814 : tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10314 :
10315 : /* Get entry to use. */
10316 37814 : tree bitstart = bitsize_int (vec_index);
10317 37814 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10318 :
10319 37814 : if (loop_vinfo)
10320 : {
10321 : /* Ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
10322 : PHI requirement; insert one PHI node for it. It looks like:
10323 : loop;
10324 : BB:
10325 : # lhs' = PHI <lhs>
10326 : ==>
10327 : loop;
10328 : BB:
10329 : # vec_lhs' = PHI <vec_lhs>
10330 : new_tree = lane_extract <vec_lhs', ...>;
10331 : lhs' = new_tree; */
10332 :
10333 2903 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10334 : /* Check if we have a loop where the chosen exit is not the main exit;
10335 : in these cases, for an early break, we restart the iteration the vector
10336 : code was executing. For the live values we want the value at the start
10337 : of that iteration rather than at the end. */
10338 2903 : edge main_e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
10339 2903 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10340 15052 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10341 9246 : if (!is_gimple_debug (use_stmt)
10342 9246 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10343 2844 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10344 : {
10345 2844 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10346 2844 : phi_arg_index_from_use (use_p));
10347 2844 : gcc_assert (loop_exit_edge_p (loop, e));
10348 2844 : bool main_exit_edge = e == main_e;
10349 2844 : tree tmp_vec_lhs = vec_lhs;
10350 2844 : tree tmp_bitstart = bitstart;
10351 :
10352 : /* For an early exit where the exit is not in the BB that leads
10353 : to the latch we're restarting the iteration in the
10354 : scalar loop. So get the first live value. */
10355 2844 : bool early_break_first_element_p
10356 2844 : = all_exits_as_early_p || !main_exit_edge;
10357 2844 : if (early_break_first_element_p)
10358 : {
10359 195 : tmp_vec_lhs = vec_lhs0;
10360 195 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10361 : }
10362 :
10363 2844 : gimple_stmt_iterator exit_gsi;
10364 2844 : tree new_tree
10365 2844 : = vectorizable_live_operation_1 (loop_vinfo,
10366 : e->dest, vectype,
10367 : slp_node, bitsize,
10368 : tmp_bitstart, tmp_vec_lhs,
10369 : lhs_type, &exit_gsi);
10370 :
10371 2844 : auto gsi = gsi_for_stmt (use_stmt);
10372 2844 : tree lhs_phi = gimple_phi_result (use_stmt);
10373 2844 : remove_phi_node (&gsi, false);
10374 2844 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10375 2844 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10376 2844 : break;
10377 2903 : }
10378 :
10379 : /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10380 12208 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10381 6402 : gcc_assert (is_gimple_debug (use_stmt)
10382 2903 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10383 : }
10384 : else
10385 : {
10386 : /* For basic-block vectorization simply insert the lane-extraction. */
10387 34911 : tree bftype = TREE_TYPE (vectype);
10388 34911 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10389 2 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10390 34911 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10391 : vec_lhs, bitsize, bitstart);
10392 34911 : gimple_seq stmts = NULL;
10393 34911 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10394 : &stmts, true, NULL_TREE);
10395 34911 : if (TREE_CODE (new_tree) == SSA_NAME
10396 69822 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10397 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10398 34911 : if (is_a <gphi *> (vec_stmt))
10399 : {
10400 2471 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10401 2471 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10402 : }
10403 : else
10404 : {
10405 32440 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10406 32440 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10407 : }
10408 :
10409 : /* Replace use of lhs with newly computed result. If the use stmt is a
10410 : single arg PHI, just replace all uses of the PHI result. This is necessary
10411 : because the lcssa PHI defining lhs may be before the newly inserted stmt. */
10412 34911 : use_operand_p use_p;
10413 34911 : stmt_vec_info use_stmt_info;
10414 204447 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10415 134625 : if (!is_gimple_debug (use_stmt)
10416 134625 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10417 98888 : || !PURE_SLP_STMT (use_stmt_info)))
10418 : {
10419 : /* ??? This can happen when the live lane ends up being
10420 : rooted in a vector construction code-generated by an
10421 : external SLP node (and code-generation for that already
10422 : happened).
10423 : Doing this is what would happen if that vector CTOR
10424 : were not code-generated yet so it is not too bad.
10425 : ??? In fact we'd likely want to avoid this situation
10426 : in the first place. */
10427 60323 : if (TREE_CODE (new_tree) == SSA_NAME
10428 60323 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10429 60323 : && gimple_code (use_stmt) != GIMPLE_PHI
10430 114496 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10431 : use_stmt))
10432 : {
10433 0 : if (dump_enabled_p ())
10434 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10435 : "Using original scalar computation for "
10436 : "live lane because use precedes vector "
10437 : "def\n");
10438 0 : continue;
10439 : }
10440 185025 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10441 : {
10442 : /* ??? It can also happen that we end up pulling a def into
10443 : a loop where replacing out-of-loop uses would require
10444 : a new LC SSA PHI node. Retain the original scalar in
10445 : those cases as well. PR98064. */
10446 62351 : edge e;
10447 62351 : if (TREE_CODE (new_tree) == SSA_NAME
10448 62351 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10449 62351 : && (gimple_bb (use_stmt)->loop_father
10450 62351 : != gimple_bb (vec_stmt)->loop_father)
10451 : /* But a replacement in a LC PHI is OK. This happens
10452 : in gcc.dg/vect/bb-slp-57.c for example. */
10453 6822 : && (gimple_code (use_stmt) != GIMPLE_PHI
10454 2757 : || (((e = phi_arg_edge_from_use (use_p)), true)
10455 2757 : && !loop_exit_edge_p
10456 2757 : (gimple_bb (vec_stmt)->loop_father, e)))
10457 67437 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10458 5086 : gimple_bb (use_stmt)->loop_father))
10459 : {
10460 0 : if (dump_enabled_p ())
10461 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10462 : "Using original scalar computation for "
10463 : "live lane because there is an "
10464 : "out-of-loop definition for it\n");
10465 0 : continue;
10466 : }
10467 62351 : SET_USE (use_p, new_tree);
10468 : }
10469 60323 : update_stmt (use_stmt);
10470 34911 : }
10471 : }
10472 :
10473 : return true;
10474 259740 : }
10475 :
10476 : /* Given loop represented by LOOP_VINFO, return true if computation of
10477 : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10478 : otherwise. */
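      :
      : /* For example (values assumed for illustration): if NITERS has type
      :    unsigned int and NITERSM1 is 0xffffffff, then NITERSM1 + 1 wraps
      :    to 0 and this returns false; for NITERSM1 = 99 it returns true.  */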
10479 :
10480 : static bool
10481 61369 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
10482 : {
10483 61369 : gcc_assert (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo));
10484 :
10485 : /* Constant case. */
10486 61369 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10487 : {
10488 35836 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10489 35836 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10490 :
10491 35836 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10492 35836 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10493 35836 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10494 : return true;
10495 : }
10496 :
10497 25533 : widest_int max;
10498 25533 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10499 : /* Check the upper bound of loop niters. */
10500 25533 : if (get_max_loop_iterations (loop, &max))
10501 : {
10502 25533 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10503 25533 : signop sgn = TYPE_SIGN (type);
10504 25533 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10505 25533 : if (max < type_max)
10506 25308 : return true;
10507 25533 : }
10508 : return false;
10509 25533 : }
10510 :
10511 : /* Return a mask type with half the number of elements as OLD_TYPE,
10512 : given that it should have mode NEW_MODE. */
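      :
      : /* For example (illustrative; the concrete modes are target-specific):
      :    halving an 8-lane mask type yields a 4-lane one, as when an SVE
      :    VNx8BI predicate becomes VNx4BI for elements twice as wide.  */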
10513 :
10514 : tree
10515 4795 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10516 : {
10517 4795 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10518 4795 : return build_truth_vector_type_for_mode (nunits, new_mode);
10519 : }
10520 :
10521 : /* Return a mask type with twice as many elements as OLD_TYPE,
10522 : given that it should have mode NEW_MODE. */
10523 :
10524 : tree
10525 7186 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10526 : {
10527 7186 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10528 7186 : return build_truth_vector_type_for_mode (nunits, new_mode);
10529 : }
10530 :
10531 : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10532 : contain a sequence of NVECTORS masks that each control a vector of type
10533 : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10534 : these vector masks with the vector version of SCALAR_MASK. */
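      :
      : /* For example (an illustrative case): for a conditional store
      :    if (c[i]) a[i] = x; in a fully-masked loop the effective control
      :    would be loop_mask & vec_c rather than the loop mask alone;
      :    recording SCALAR_MASK here lets later queries track that.  */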
10535 :
10536 : void
10537 105286 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10538 : unsigned int nvectors, tree vectype, tree scalar_mask)
10539 : {
10540 105286 : gcc_assert (nvectors != 0);
10541 :
10542 105286 : if (scalar_mask)
10543 : {
10544 4979 : scalar_cond_masked_key cond (scalar_mask, nvectors);
10545 4979 : loop_vinfo->scalar_cond_masked_set.add (cond);
10546 : }
10547 :
10548 105286 : masks->mask_set.add (std::make_pair (vectype, nvectors));
10549 105286 : }
10550 :
10551 : /* Given a complete set of masks MASKS, extract mask number INDEX
10552 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10553 : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10554 :
10555 : See the comment above vec_loop_masks for more details about the mask
10556 : arrangement. */
10557 :
10558 : tree
10559 208 : vect_get_loop_mask (loop_vec_info loop_vinfo,
10560 : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10561 : unsigned int nvectors, tree vectype, unsigned int index)
10562 : {
10563 208 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10564 : == vect_partial_vectors_while_ult)
10565 : {
10566 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10567 0 : tree mask_type = rgm->type;
10568 :
10569 : /* Populate the rgroup's mask array, if this is the first time we've
10570 : used it. */
10571 0 : if (rgm->controls.is_empty ())
10572 : {
10573 0 : rgm->controls.safe_grow_cleared (nvectors, true);
10574 0 : for (unsigned int i = 0; i < nvectors; ++i)
10575 : {
10576 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10577 : /* Provide a dummy definition until the real one is available. */
10578 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10579 0 : rgm->controls[i] = mask;
10580 : }
10581 : }
10582 :
10583 0 : tree mask = rgm->controls[index];
10584 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10585 0 : TYPE_VECTOR_SUBPARTS (vectype)))
10586 : {
10587 : /* A loop mask for data type X can be reused for data type Y
10588 : if X has N times more elements than Y and if Y's elements
10589 : are N times bigger than X's. In this case each sequence
10590 : of N elements in the loop mask will be all-zero or all-one.
10591 : We can then view-convert the mask so that each sequence of
10592 : N elements is replaced by a single element. */
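      : /* For instance (an assumed example): a mask built for 16 QI lanes
      :    can control 8 HI lanes, since each adjacent pair of mask elements
      :    is known all-zero or all-one and collapses to a single element.  */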
10593 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10594 : TYPE_VECTOR_SUBPARTS (vectype)));
10595 0 : gimple_seq seq = NULL;
10596 0 : mask_type = truth_type_for (vectype);
10597 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10598 0 : if (seq)
10599 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10600 : }
10601 0 : return mask;
10602 : }
10603 208 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10604 : == vect_partial_vectors_avx512)
10605 : {
10606 : /* The number of scalars per iteration and the number of vectors are
10607 : both compile-time constants. */
10608 208 : unsigned int nscalars_per_iter
10609 208 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10610 208 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10611 :
10612 208 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10613 :
10614 : /* The stored nV is dependent on the mask type produced. */
10615 208 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10616 : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10617 : == rgm->factor);
10618 208 : nvectors = rgm->factor;
10619 :
10620 : /* Populate the rgroup's mask array, if this is the first time we've
10621 : used it. */
10622 208 : if (rgm->controls.is_empty ())
10623 : {
10624 20 : rgm->controls.safe_grow_cleared (nvectors, true);
10625 106 : for (unsigned int i = 0; i < nvectors; ++i)
10626 : {
10627 86 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10628 : /* Provide a dummy definition until the real one is available. */
10629 86 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10630 86 : rgm->controls[i] = mask;
10631 : }
10632 : }
10633 208 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10634 : TYPE_VECTOR_SUBPARTS (vectype)))
10635 160 : return rgm->controls[index];
10636 :
10637 : /* Split the vector if needed. Since with AVX512 we are dealing with
10638 : integer mode masks we can operate on the integer representation,
10639 : doing the splitting with a shift of the whole vector. */
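      : /* E.g. (an assumed case): with an integer-mode mask covering two
      :    V8HI vectors, part INDEX 1 is obtained as (mask >> 8) viewed
      :    back as an 8-lane mask type.  */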
10640 48 : unsigned HOST_WIDE_INT factor;
10641 48 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10642 48 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
10643 0 : gcc_assert (ok);
10644 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10645 48 : tree mask_type = truth_type_for (vectype);
10646 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10647 48 : unsigned vi = index / factor;
10648 48 : unsigned vpart = index % factor;
10649 48 : tree vec = rgm->controls[vi];
10650 48 : gimple_seq seq = NULL;
10651 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10652 48 : lang_hooks.types.type_for_mode
10653 48 : (TYPE_MODE (rgm->type), 1), vec);
10654 : /* For integer mode masks simply shift the right bits into position. */
10655 48 : if (vpart != 0)
10656 40 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10657 : build_int_cst (integer_type_node,
10658 80 : (TYPE_VECTOR_SUBPARTS (vectype)
10659 40 : * vpart)));
10660 48 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10661 48 : (TYPE_MODE (mask_type), 1), vec);
10662 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10663 48 : if (seq)
10664 48 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10665 48 : return vec;
10666 : }
10667 : else
10668 0 : gcc_unreachable ();
10669 : }
10670 :
10671 : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10672 : lengths for controlling an operation on VECTYPE. The operation splits
10673 : each element of VECTYPE into FACTOR separate subelements, measuring the
10674 : length as a number of these subelements. */
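      :
      : /* For example (illustrative): a V2DI access that falls back to a
      :    byte-wise VnQI load or store would be recorded with FACTOR 8, so
      :    its length is measured in bytes rather than in DI elements.  */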
10675 :
10676 : void
10677 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10678 : unsigned int nvectors, tree vectype, unsigned int factor)
10679 : {
10680 0 : gcc_assert (nvectors != 0);
10681 0 : if (lens->length () < nvectors)
10682 0 : lens->safe_grow_cleared (nvectors, true);
10683 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10684 :
10685 : /* The number of scalars per iteration, the scalar occupied bytes and
10686 : the number of vectors are all compile-time constants. */
10687 0 : unsigned int nscalars_per_iter
10688 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10689 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10690 :
10691 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10692 : {
10693 : /* For now, we only support cases in which all loads and stores fall back
10694 : to VnQI or none do. */
10695 0 : gcc_assert (!rgl->max_nscalars_per_iter
10696 : || (rgl->factor == 1 && factor == 1)
10697 : || (rgl->max_nscalars_per_iter * rgl->factor
10698 : == nscalars_per_iter * factor));
10699 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
10700 0 : rgl->type = vectype;
10701 0 : rgl->factor = factor;
10702 : }
10703 0 : }
10704 :
10705 : /* Given a complete set of lengths LENS, extract length number INDEX
10706 : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10707 : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10708 : multiplied by the number of elements that should be processed.
10709 : Insert any set-up statements before GSI. */
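      :
      : /* For example (an assumed case): if the recorded rgroup type is
      :    V16QI but VECTYPE is V4SI, the stored length counts 16 subelements
      :    per vector, so the value returned is loop_len / 4, emitted as an
      :    EXACT_DIV_EXPR before GSI.  */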
10710 :
10711 : tree
10712 0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10713 : vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10714 : unsigned int index, unsigned int factor, bool adjusted)
10715 : {
10716 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10717 0 : bool use_bias_adjusted_len =
10718 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10719 :
10720 : /* Populate the rgroup's len array, if this is the first time we've
10721 : used it. */
10722 0 : if (rgl->controls.is_empty ())
10723 : {
10724 0 : rgl->controls.safe_grow_cleared (nvectors, true);
10725 0 : for (unsigned int i = 0; i < nvectors; ++i)
10726 : {
10727 0 : tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10728 0 : gcc_assert (len_type != NULL_TREE);
10729 :
10730 0 : tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10731 :
10732 : /* Provide a dummy definition until the real one is available. */
10733 0 : SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10734 0 : rgl->controls[i] = len;
10735 :
10736 0 : if (use_bias_adjusted_len)
10737 : {
10738 0 : gcc_assert (i == 0);
10739 0 : tree adjusted_len =
10740 0 : make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10741 0 : SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10742 0 : rgl->bias_adjusted_ctrl = adjusted_len;
10743 : }
10744 : }
10745 : }
10746 :
10747 0 : if (use_bias_adjusted_len && adjusted)
10748 0 : return rgl->bias_adjusted_ctrl;
10749 :
10750 0 : tree loop_len = rgl->controls[index];
10751 0 : if (rgl->factor == 1 && factor == 1)
10752 : {
10753 0 : poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10754 0 : poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10755 0 : if (maybe_ne (nunits1, nunits2))
10756 : {
10757 : /* A loop len for data type X can be reused for data type Y
10758 : if X has N times more elements than Y and if Y's elements
10759 : are N times bigger than X's. */
10760 0 : gcc_assert (multiple_p (nunits1, nunits2));
10761 0 : factor = exact_div (nunits1, nunits2).to_constant ();
10762 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10763 0 : gimple_seq seq = NULL;
10764 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10765 0 : build_int_cst (iv_type, factor));
10766 0 : if (seq)
10767 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10768 : }
10769 0 : }
10770 0 : else if (factor && rgl->factor != factor)
10771 : {
10772 : /* The number of scalars per iteration, the scalar occupied bytes and
10773 : the number of vectors are all compile-time constants. */
10774 0 : unsigned int nscalars_per_iter
10775 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10776 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10777 0 : unsigned int rglvecsize = rgl->factor * rgl->max_nscalars_per_iter;
10778 0 : unsigned int vecsize = nscalars_per_iter * factor;
10779 0 : if (rglvecsize > vecsize)
10780 : {
10781 0 : unsigned int fac = rglvecsize / vecsize;
10782 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10783 0 : gimple_seq seq = NULL;
10784 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10785 0 : build_int_cst (iv_type, fac));
10786 0 : if (seq)
10787 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10788 : }
10789 0 : else if (rglvecsize < vecsize)
10790 : {
10791 0 : unsigned int fac = vecsize / rglvecsize;
10792 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10793 0 : gimple_seq seq = NULL;
10794 0 : loop_len = gimple_build (&seq, MULT_EXPR, iv_type, loop_len,
10795 0 : build_int_cst (iv_type, fac));
10796 0 : if (seq)
10797 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10798 : }
10799 : }
10800 : return loop_len;
10801 : }
10802 :
10803 : /* Generate the tree for the loop len mask and return it. Given LENS,
10804 : NVECTORS, VECTYPE, INDEX and FACTOR, generate the len mask as below:
10805 :
10806 : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
10807 : */
10808 : tree
10809 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10810 : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
10811 : unsigned int nvectors, tree vectype, tree stmt,
10812 : unsigned int index, unsigned int factor)
10813 : {
10814 0 : tree all_one_mask = build_all_ones_cst (vectype);
10815 0 : tree all_zero_mask = build_zero_cst (vectype);
10816 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
10817 : factor, true);
10818 0 : tree bias = build_int_cst (intQI_type_node,
10819 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
10820 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
10821 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
10822 : all_one_mask, all_zero_mask, len,
10823 : bias);
10824 0 : gimple_call_set_lhs (call, len_mask);
10825 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
10826 :
10827 0 : return len_mask;
10828 : }
10829 :
10830 : /* Scale profiling counters by estimation for LOOP which is vectorized
10831 : by factor VF.
10832 : If FLAT is true, the loop we started with had an unrealistically flat
10833 : profile. */
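      :
      : /* For example (numbers assumed for illustration): with VF = 4 and a
      :    reliable, non-flat profile, the body count is scaled down by a
      :    factor of 4 and the exit edge probability is raised accordingly,
      :    so the vector loop is expected to iterate a quarter as often.  */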
10834 :
10835 : static void
10836 61412 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
10837 : {
10838 : /* For flat profiles do not scale down proportionally by VF and only
10839 : cap by known iteration count bounds. */
10840 61412 : if (flat)
10841 : {
10842 34542 : if (dump_file && (dump_flags & TDF_DETAILS))
10843 5268 : fprintf (dump_file,
10844 : "Vectorized loop profile seems flat; not scaling iteration "
10845 : "count down by the vectorization factor %i\n", vf);
10846 34542 : scale_loop_profile (loop, profile_probability::always (),
10847 : get_likely_max_loop_iterations_int (loop));
10848 34542 : return;
10849 : }
10850 : /* The loop body executes VF times fewer and the exit probability increases VF times. */
10851 26870 : profile_count entry_count = loop_preheader_edge (loop)->count ();
10852 :
10853 : /* If we have an unreliable loop profile avoid dropping the entry
10854 : count below the header count. This can happen for loops with
10855 : unrealistically low trip counts. */
10856 26870 : while (vf > 1
10857 27958 : && loop->header->count > entry_count
10858 56935 : && loop->header->count < entry_count * vf)
10859 : {
10860 2107 : if (dump_file && (dump_flags & TDF_DETAILS))
10861 153 : fprintf (dump_file,
10862 : "Vectorization factor %i seems too large for profile "
10863 : "previously believed to be consistent; reducing.\n", vf);
10864 2107 : vf /= 2;
10865 : }
10866 :
10867 26870 : if (entry_count.nonzero_p ())
10868 26870 : set_edge_probability_and_rescale_others
10869 26870 : (exit_e,
10870 26870 : entry_count.probability_in (loop->header->count / vf));
10871 : /* Avoid producing a very large exit probability when we do not have
10872 : a sensible profile. */
10873 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
10874 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10875 26870 : loop->latch->count = single_pred_edge (loop->latch)->count ();
10876 :
10877 26870 : scale_loop_profile (loop, profile_probability::always () / vf,
10878 : get_likely_max_loop_iterations_int (loop));
10879 : }
10880 :
10881 : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10882 : original loop that has now been vectorized.
10883 :
10884 : The inits of the data_references need to be advanced with the number of
10885 : iterations of the main loop. This has been computed in vect_do_peeling and
10886 : is stored in parameter ADVANCE.
10887 :
10888 : Since the loop_vec_info of this EPILOGUE was constructed for the original
10889 : loop, its stmt_vec_infos all point to the original statements. These need
10890 : to be updated to point to their corresponding copies.
10891 :
10892 : The data_references' connections also need to be updated: their
10893 : corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
10894 : stmt_vec_infos, and their statements need to point to their
10895 : corresponding copy. */
10896 :
10897 : static void
10898 6824 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10899 : {
10900 6824 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10901 6824 : hash_map<tree,tree> mapping;
10902 6824 : gimple *orig_stmt, *new_stmt;
10903 6824 : gimple_stmt_iterator epilogue_gsi;
10904 6824 : gphi_iterator epilogue_phi_gsi;
10905 6824 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10906 6824 : basic_block *epilogue_bbs = get_loop_body (epilogue);
10907 6824 : unsigned i;
10908 :
10909 6824 : free (LOOP_VINFO_BBS (epilogue_vinfo));
10910 6824 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10911 6824 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
10912 :
10913 : /* The EPILOGUE loop is a copy of the original loop so they share the same
10914 : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10915 : point to the copied statements. */
10916 20472 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10917 : {
10918 13648 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10919 35159 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10920 : {
10921 21511 : new_stmt = epilogue_phi_gsi.phi ();
10922 :
10923 21511 : gcc_assert (gimple_uid (new_stmt) > 0);
10924 21511 : stmt_vinfo
10925 21511 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10926 :
10927 21511 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10928 : }
10929 :
10930 27296 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10931 136522 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10932 : {
10933 122874 : new_stmt = gsi_stmt (epilogue_gsi);
10934 122874 : if (is_gimple_debug (new_stmt))
10935 20253 : continue;
10936 :
10937 102621 : gcc_assert (gimple_uid (new_stmt) > 0);
10938 102621 : stmt_vinfo
10939 102621 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10940 :
10941 102621 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10942 :
10943 102621 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10944 102621 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10945 : {
10946 1938 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10947 : /* Set BB such that the assert in
10948 : 'get_initial_defs_for_reduction' is able to determine that
10949 : the BB of the related stmt is inside this loop. */
10950 1938 : gimple_set_bb (stmt,
10951 : gimple_bb (new_stmt));
10952 1938 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10953 1938 : gcc_assert (related_vinfo == NULL
10954 : || related_vinfo == stmt_vinfo);
10955 : }
10956 : }
10957 : }
10958 :
10959 6824 : struct data_reference *dr;
10960 6824 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10961 30864 : FOR_EACH_VEC_ELT (datarefs, i, dr)
10962 : {
10963 24040 : orig_stmt = DR_STMT (dr);
10964 24040 : gcc_assert (gimple_uid (orig_stmt) > 0);
10965 24040 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10966 24040 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10967 : }
10968 :
10969 : /* Advance the data_references with the number of iterations of the
10970 : previous loop and its prologue. */
10971 6824 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10972 :
10973 : /* Remember the advancement made. */
10974 6824 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
10975 6824 : }
10976 :
10977 : /* When vectorizing early break statements, instructions that happen before
10978 : the early break in the current BB need to be moved to after the early
10979 : break. This function deals with that and assumes that any validity
10980 : checks have already been performed.
10981 :
10982 : While moving the instructions, if it encounters a VUSE or VDEF it
10983 : corrects the VUSEs as it moves the statements along. The statements
10984 : are inserted at the recorded early-break destination block. */
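      :
      : /* For example (an illustrative sketch of the effect): for a body
      :
      :      a[i] = x;
      :      if (b[i] == y) break;
      :
      :    the store to a[i] is moved below the early-break check so that
      :    the loads feeding the break condition execute before any stores:
      :
      :      if (b[i] == y) break;
      :      a[i] = x;  */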
10985 :
10986 : static void
10987 1405 : move_early_exit_stmts (loop_vec_info loop_vinfo)
10988 : {
10989 1405 : DUMP_VECT_SCOPE ("move_early_exit_stmts");
10990 :
10991 1405 : if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
10992 1188 : return;
10993 :
10994 : /* Move all stmts that need moving. */
10995 217 : basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
10996 217 : gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
10997 :
10998 217 : tree last_seen_vuse = NULL_TREE;
10999 533 : for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11000 : {
11001 : /* We have to update crossed degenerate virtual PHIs. Simply
11002 : elide them. */
11003 316 : if (gphi *vphi = dyn_cast <gphi *> (stmt))
11004 : {
11005 7 : tree vdef = gimple_phi_result (vphi);
11006 7 : tree vuse = gimple_phi_arg_def (vphi, 0);
11007 7 : imm_use_iterator iter;
11008 7 : use_operand_p use_p;
11009 7 : gimple *use_stmt;
11010 30 : FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
11011 : {
11012 48 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
11013 16 : SET_USE (use_p, vuse);
11014 7 : }
11015 7 : auto gsi = gsi_for_stmt (stmt);
11016 7 : remove_phi_node (&gsi, true);
11017 7 : last_seen_vuse = vuse;
11018 7 : continue;
11019 7 : }
11020 :
11021 : /* Check to see if statement is still required for vect or has been
11022 : elided. */
11023 309 : auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11024 309 : if (!stmt_info)
11025 0 : continue;
11026 :
11027 309 : if (dump_enabled_p ())
11028 158 : dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11029 :
11030 309 : gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11031 309 : gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
11032 618 : last_seen_vuse = gimple_vuse (stmt);
11033 : }
11034 :
11035 : /* Update all the stmts with their new reaching VUSES. */
11036 679 : for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11037 : {
11038 194 : if (dump_enabled_p ())
11039 158 : dump_printf_loc (MSG_NOTE, vect_location,
11040 : "updating vuse to %T for load %G",
11041 : last_seen_vuse, p);
11042 194 : gimple_set_vuse (p, last_seen_vuse);
11043 194 : update_stmt (p);
11044 : }
11045 :
11046 : /* And update the LC PHIs on exits. */
11047 1098 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11048 447 : if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11049 243 : if (gphi *phi = get_virtual_phi (e->dest))
11050 460 : SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11051 : }
11052 :
11053 : /* Generate adjustment code for early break scalar IVs, filling in the
11054 : value we created earlier for LOOP_VINFO_EARLY_BRK_NITERS_VAR. */
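      :
      : /* A sketch of what is created (SSA names assumed): a counting IV in
      :    the loop header, starting at zero (adjusted when peeling skips
      :    iterations) and stepping by VF, or by the .SELECT_VL result on
      :    loop-len targets:
      :
      :      # ivtmp = PHI <0 (preheader), ivtmp_next (latch)>
      :      ...
      :      ivtmp_next = ivtmp + VF;  */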
11055 :
11056 : static void
11057 1405 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vec_info loop_vinfo)
11058 : {
11059 1405 : DUMP_VECT_SCOPE ("vect_update_ivs_after_vectorizer_for_early_breaks");
11060 :
11061 1405 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11062 0 : return;
11063 :
11064 1405 : gcc_assert (LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo));
11065 :
11066 1405 : tree phi_var = LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo);
11067 1405 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11068 1405 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11069 1405 : tree ty_var = TREE_TYPE (phi_var);
11070 1405 : auto loop = LOOP_VINFO_LOOP (loop_vinfo);
11071 1405 : tree induc_var = niters_skip ? copy_ssa_name (phi_var) : phi_var;
11072 :
11073 1405 : auto induction_phi = create_phi_node (induc_var, loop->header);
11074 1405 : tree induc_def = PHI_RESULT (induction_phi);
11075 :
11076 : /* Create the iv update inside the loop. */
11077 1405 : gimple_seq init_stmts = NULL;
11078 1405 : gimple_seq stmts = NULL;
11079 1405 : gimple_seq iv_stmts = NULL;
11080 1405 : tree tree_vf = build_int_cst (ty_var, vf);
11081 :
11082 : /* For loop-len targets we have to use .SELECT_VL (ivtmp_33, VF) instead of
11083 : just += VF, as the number of elements processed can change between two
11084 : loop iterations. */
11084 1405 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
11085 : {
11086 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
11087 0 : tree_vf = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
11088 : NULL_TREE, 0, 0, true);
11089 : }
11090 :
11091 1405 : tree iter_var;
11092 1405 : if (POINTER_TYPE_P (ty_var))
11093 : {
11094 0 : tree offset = gimple_convert (&stmts, sizetype, tree_vf);
11095 0 : iter_var = gimple_build (&stmts, POINTER_PLUS_EXPR, ty_var, induc_def,
11096 : gimple_convert (&stmts, sizetype, offset));
11097 : }
11098 : else
11099 : {
11100 1405 : tree offset = gimple_convert (&stmts, ty_var, tree_vf);
11101 1405 : iter_var = gimple_build (&stmts, PLUS_EXPR, ty_var, induc_def, offset);
11102 : }
11103 :
11104 1405 : tree init_var = build_zero_cst (ty_var);
11105 1405 : if (niters_skip)
11106 0 : init_var = gimple_build (&init_stmts, MINUS_EXPR, ty_var, init_var,
11107 : gimple_convert (&init_stmts, ty_var, niters_skip));
11108 :
11109 1405 : add_phi_arg (induction_phi, iter_var,
11110 : loop_latch_edge (loop), UNKNOWN_LOCATION);
11111 1405 : add_phi_arg (induction_phi, init_var,
11112 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
11113 :
11114 : /* Find the first insertion point in the BB. */
11115 1405 : auto pe = loop_preheader_edge (loop);
11116 :
11117 : /* If we've done any peeling, calculate the peeling adjustment needed to the
11118 : final IV. */
11119 1405 : if (niters_skip)
11120 : {
11121 0 : tree induc_type = TREE_TYPE (induc_def);
11122 0 : tree s_induc_type = signed_type_for (induc_type);
11123 0 : induc_def = gimple_build (&iv_stmts, MAX_EXPR, s_induc_type,
11124 : gimple_convert (&iv_stmts, s_induc_type,
11125 : induc_def),
11126 : build_zero_cst (s_induc_type));
11127 0 : auto stmt = gimple_build_assign (phi_var,
11128 : gimple_convert (&iv_stmts, induc_type,
11129 : induc_def));
11130 0 : gimple_seq_add_stmt_without_update (&iv_stmts, stmt);
11131 0 : basic_block exit_bb = NULL;
11132 : /* Identify the early exit merge block. I wish we had stored this. */
11133 0 : for (auto e : get_loop_exit_edges (loop))
11134 0 : if (e != LOOP_VINFO_MAIN_EXIT (loop_vinfo))
11135 : {
11136 0 : exit_bb = e->dest;
11137 0 : break;
11138 0 : }
11139 :
11140 0 : gcc_assert (exit_bb);
11141 0 : auto exit_gsi = gsi_after_labels (exit_bb);
11142 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
11143 : }
11144 : /* Write the init_stmts in the loop-preheader block. */
11145 1405 : auto psi = gsi_last_nondebug_bb (pe->src);
11146 1405 : gsi_insert_seq_after (&psi, init_stmts, GSI_LAST_NEW_STMT);
11147 : /* Write the adjustments in the header block. */
11148 1405 : basic_block bb = loop->header;
11149 1405 : auto si = gsi_after_labels (bb);
11150 1405 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11151 : }
11152 :
11153 : /* Function vect_transform_loop.
11154 :
11155 : The analysis phase has determined that the loop is vectorizable.
11156 : Vectorize the loop - created vectorized stmts to replace the scalar
11157 : stmts in the loop, and update the loop exit condition.
11158 : Returns scalar epilogue loop if any. */
11159 :
11160 : class loop *
11161 61412 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11162 : {
11163 61412 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11164 61412 : class loop *epilogue = NULL;
11165 61412 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11166 61412 : int nbbs = loop->num_nodes;
11167 61412 : int i;
11168 61412 : tree niters_vector = NULL_TREE;
11169 61412 : tree step_vector = NULL_TREE;
11170 61412 : tree niters_vector_mult_vf = NULL_TREE;
11171 61412 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11172 61412 : unsigned int lowest_vf = constant_lower_bound (vf);
11173 61412 : gimple *stmt;
11174 61412 : bool check_profitability = false;
11175 61412 : unsigned int th;
11176 61412 : bool flat = maybe_flat_loop_profile (loop);
11177 61412 : bool uncounted_p = LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo);
11178 :
11179 61412 : DUMP_VECT_SCOPE ("vec_transform_loop");
11180 :
11181 61412 : if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11182 54588 : loop_vinfo->shared->check_datarefs ();
11183 :
11184 : /* Use the more conservative vectorization threshold. If the number
11185 : of iterations is constant, assume the cost check has been performed
11186 : by our caller. If the threshold makes all loops profitable that
11187 : run at least the (estimated) vectorization factor number of times,
11188 : checking is pointless, too. */
11189 61412 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11190 61412 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11191 : {
11192 18511 : if (dump_enabled_p ())
11193 176 : dump_printf_loc (MSG_NOTE, vect_location,
11194 : "Profitability threshold is %d loop iterations.\n",
11195 : th);
11196 : check_profitability = true;
11197 : }
11198 :
11199 : /* Make sure there exists a single-predecessor exit bb. Do this before
11200 : versioning. */
11201 61412 : edge e = LOOP_VINFO_MAIN_EXIT (loop_vinfo);
11202 61412 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11203 : {
11204 18956 : split_loop_exit_edge (e, true);
11205 18956 : if (dump_enabled_p ())
11206 2274 : dump_printf (MSG_NOTE, "split exit edge\n");
11207 : }
11208 :
11209 : /* Version the loop first, if required, so the profitability check
11210 : comes first. */
11211 :
11212 61412 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11213 : {
11214 3764 : class loop *sloop
11215 3764 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11216 3764 : sloop->force_vectorize = false;
11217 3764 : check_profitability = false;
11218 : }
11219 :
11220 : /* Make sure there exists a single-predecessor exit bb also on the
11221 : scalar loop copy. Do this after versioning but before peeling
11222 : so the CFG structure is fine for both the scalar and the if-converted
11223 : loop, letting slpeel_duplicate_current_defs_from_edges see matched
11224 : loop-closed PHI nodes on the exit. */
11225 61412 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11226 : {
11227 8038 : e = LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo);
11228 8038 : if (! single_pred_p (e->dest))
11229 : {
11230 7780 : split_loop_exit_edge (e, true);
11231 7780 : if (dump_enabled_p ())
11232 1137 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11233 : }
11234 : }
11235 :
11236 61412 : tree niters = vect_build_loop_niters (loop_vinfo);
11237 61412 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11238 61412 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11239 61412 : tree advance;
11240 61412 : drs_init_vec orig_drs_init;
11241 61412 : bool niters_no_overflow = uncounted_p ? false /* Not known. */
11242 61369 : : loop_niters_no_overflow (loop_vinfo);
11243 :
11244 61412 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11245 : &step_vector, &niters_vector_mult_vf, th,
11246 : check_profitability, niters_no_overflow,
11247 : &advance);
11248 :
11249 : /* Assign hierarchical discriminators to the vectorized loop. */
11250 61412 : poly_uint64 vf_val = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11251 61412 : unsigned int vf_int = constant_lower_bound (vf_val);
11252 61412 : if (vf_int > DISCR_MULTIPLICITY_MAX)
11253 : vf_int = DISCR_MULTIPLICITY_MAX;
11254 :
11255 : /* Assign unique copy_id dynamically instead of using hardcoded constants.
11256 : Epilogue and main vectorized loops get different copy_ids. */
11257 61412 : gimple *loop_last = last_nondebug_stmt (loop->header);
11258 61412 : location_t loop_loc
11259 61412 : = loop_last ? gimple_location (loop_last) : UNKNOWN_LOCATION;
11260 61134 : if (loop_loc != UNKNOWN_LOCATION)
11261 : {
11262 50608 : unsigned int copyid = allocate_copyid_base (loop_loc, 1);
11263 50608 : assign_discriminators_to_loop (loop, vf_int, copyid);
11264 : }
11265 61412 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11266 61412 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11267 : {
11268 : /* If-conversion duplicates the loop preheader and loop body and produces
11269 : a basic block after the loop exit. We need to scale all of that. */
11270 88 : basic_block preheader
11271 88 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11272 88 : preheader->count
11273 : = preheader->count.apply_probability
11274 88 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11275 88 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11276 : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11277 88 : LOOP_VINFO_SCALAR_MAIN_EXIT (loop_vinfo)->dest->count = preheader->count;
11278 : }
11279 :
11280 61412 : if (niters_vector == NULL_TREE && !uncounted_p)
11281 : {
11282 27395 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11283 27395 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11284 55575 : && known_eq (lowest_vf, vf))
11285 : {
11286 27392 : niters_vector
11287 27392 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11288 27392 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11289 27392 : step_vector = build_one_cst (TREE_TYPE (niters));
11290 : }
11291 791 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11292 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11293 : &step_vector, niters_no_overflow);
11294 : else
11295 : /* vect_do_peeling subtracted the number of peeled prologue
11296 : iterations from LOOP_VINFO_NITERS. */
11297 790 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11298 : &niters_vector, &step_vector,
11299 : niters_no_overflow);
11300 : }
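 : /* Worked example (illustrative): for a compile-time trip count of 128
 : with lowest_vf == vf == 8 and no partial vectors, the block above sets
 : niters_vector = 128 / 8 = 16 and step_vector = 1, i.e. the vector loop
 : executes exactly 16 iterations with an IV increment of 1. */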
11301 :
11302 : /* 1) Make sure the loop header has exactly two entries
11303 : 2) Make sure we have a preheader basic block. */
11304 :
11305 61412 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11306 :
11307 61412 : split_edge (loop_preheader_edge (loop));
11308 :
11309 61412 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11310 : /* This will deal with any possible peeling. */
11311 1 : vect_prepare_for_masked_peels (loop_vinfo);
11312 :
11313 : /* Handle any code motion that we need to for early-break vectorization after
11314 : we've done peeling but just before we start vectorizing. */
11315 61412 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11316 : {
11317 1405 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vinfo);
11318 1405 : move_early_exit_stmts (loop_vinfo);
11319 : }
11320 :
11321 : /* Remove existing clobber stmts and prefetches. */
11322 187552 : for (i = 0; i < nbbs; i++)
11323 : {
11324 126140 : basic_block bb = bbs[i];
11325 1089396 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11326 : {
11327 837116 : stmt = gsi_stmt (si);
11328 837116 : if (gimple_clobber_p (stmt)
11329 837116 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11330 : {
11331 90 : unlink_stmt_vdef (stmt);
11332 90 : gsi_remove (&si, true);
11333 90 : release_defs (stmt);
11334 : }
11335 : else
11336 837026 : gsi_next (&si);
11337 : }
11338 : }
11339 :
11340 : /* Schedule the SLP instances. */
11341 61412 : if (!loop_vinfo->slp_instances.is_empty ())
11342 : {
11343 61412 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11344 61412 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11345 : }
11346 :
11347 : /* Generate the loop invariant statements. */
11348 61412 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11349 : {
11350 73 : if (dump_enabled_p ())
11351 30 : dump_printf_loc (MSG_NOTE, vect_location,
11352 : "------>generating loop invariant statements\n");
11353 73 : gimple_stmt_iterator gsi;
11354 73 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11355 73 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11356 : GSI_CONTINUE_LINKING);
11357 : }
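 : /* Illustration (not part of the pass): a typical invariant pattern
 : definition emitted here is a one-time preheader computation such as
 :
 : vect_cst__1 = { x_5, x_5, x_5, x_5 }; /* splat of loop-invariant x_5 */
 :
 : which the vectorized loop body then reuses on every iteration. */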
11358 :
11359 : /* Stub out scalar statements that must not survive vectorization and
11360 : were not picked as relevant in any SLP instance.
11361 : Doing this here helps with grouped statements, or statements that
11362 : are involved in patterns. */
11363 187552 : for (i = 0; i < nbbs; i++)
11364 : {
11365 126140 : basic_block bb = bbs[i];
11366 126140 : stmt_vec_info stmt_info;
11367 252280 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11368 1669574 : !gsi_end_p (gsi); gsi_next (&gsi))
11369 : {
11370 1543434 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11371 6350 : if (!call || !gimple_call_internal_p (call))
11372 1538243 : continue;
11373 5191 : internal_fn ifn = gimple_call_internal_fn (call);
11374 5191 : if (ifn == IFN_MASK_LOAD)
11375 : {
11376 737 : tree lhs = gimple_get_lhs (call);
11377 737 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11378 : {
11379 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11380 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11381 0 : gsi_replace (&gsi, new_stmt, true);
11382 : }
11383 : }
11384 4454 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11385 : {
11386 2297 : tree lhs = gimple_get_lhs (call);
11387 2297 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11388 : {
11389 0 : tree else_arg
11390 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11391 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11392 0 : gsi_replace (&gsi, new_stmt, true);
11393 : }
11394 : }
11395 2157 : else if (ifn == IFN_MASK_CALL
11396 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11397 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11398 2161 : && !STMT_VINFO_LIVE_P (stmt_info))
11399 : {
11400 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11401 4 : loop_vinfo->remove_stmt (stmt_info);
11402 : }
11403 : }
11404 : }
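 : /* Illustration (not part of the pass): a scalar leftover such as
 :
 : _1 = .MASK_LOAD (ptr, align, mask); /* scalar lhs, never vectorized */
 :
 : is stubbed out above as _1 = 0, and a scalar conditional call such as
 : .COND_ADD (mask, a, b, else) is replaced by its else value, since these
 : calls must not survive vectorization. */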
11405 :
11406 61412 : if (!uncounted_p)
11407 : {
11408 : /* The vectorization factor is always > 1, so with an IV increment of 1
11409 : the IV cannot overflow; a zero NITERS becomes a nonzero NITERS_VECTOR. */
11410 61369 : if (integer_onep (step_vector))
11411 61351 : niters_no_overflow = true;
11412 :
11413 61369 : vect_set_loop_condition (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
11414 : loop_vinfo, niters_vector, step_vector,
11415 61369 : niters_vector_mult_vf, !niters_no_overflow);
11416 : }
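 : /* Illustration (not part of the pass; conceptual only): for the common
 : STEP_VECTOR == 1 case the exit test installed above amounts to
 :
 : iv = iv + 1;
 : if (iv < niters_vector) goto loop_header;
 :
 : while NITERS_VECTOR_MULT_VF lets later code compute how many scalar
 : iterations the vector loop consumed. */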
11417 :
11418 61412 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11419 :
11420 : /* True if the final iteration might not handle a full vector's
11421 : worth of scalar iterations. */
11422 122824 : bool final_iter_may_be_partial
11423 61412 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11424 61412 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
11425 :
11426 : /* +1 to convert latch counts to loop iteration counts. */
11427 61412 : int bias_for_lowest = 1;
11428 :
11429 : /* When we are peeling for gaps we take away one scalar iteration
11430 : from the vector loop, so we can adjust the upper bound by one
11431 : scalar iteration. But only when we know the bound applies to the
11432 : IV exit test, which might not be true when we have multiple exits. */
11433 61412 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11434 119642 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11435 :
11436 61412 : int bias_for_assumed = bias_for_lowest;
11437 61412 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11438 61412 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11439 : {
11440 : /* When the amount of peeling is known at compile time, the first
11441 : iteration will have exactly alignment_npeels active elements.
11442 : In the worst case it will have at least one. */
11443 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11444 1 : bias_for_lowest += lowest_vf - min_first_active;
11445 1 : bias_for_assumed += assumed_vf - min_first_active;
11446 : }
11447 : /* In these calculations the "- 1" converts loop iteration counts
11448 : back to latch counts. */
11449 61412 : if (loop->any_upper_bound)
11450 : {
11451 61396 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11452 61396 : loop->nb_iterations_upper_bound
11453 61396 : = (final_iter_may_be_partial
11454 62803 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11455 2814 : lowest_vf) - 1
11456 59989 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11457 119978 : lowest_vf) - 1);
11458 61396 : if (main_vinfo
11459 : /* Both peeling for alignment and peeling for gaps can end up
11460 : with the scalar epilogue running for more than VF-1 iterations. */
11461 6824 : && !main_vinfo->peeling_for_alignment
11462 6776 : && !main_vinfo->peeling_for_gaps)
11463 : {
11464 6594 : unsigned int bound;
11465 6594 : poly_uint64 main_iters
11466 6594 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11467 : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11468 6594 : main_iters
11469 6594 : = upper_bound (main_iters,
11470 6594 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11471 13188 : if (can_div_away_from_zero_p (main_iters,
11472 6594 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11473 : &bound))
11474 6594 : loop->nb_iterations_upper_bound
11475 6594 : = wi::umin ((bound_wide_int) (bound - 1),
11476 6594 : loop->nb_iterations_upper_bound);
11477 : }
11478 : }
11479 61412 : if (loop->any_likely_upper_bound)
11480 61396 : loop->nb_iterations_likely_upper_bound
11481 61396 : = (final_iter_may_be_partial
11482 62803 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11483 1407 : + bias_for_lowest, lowest_vf) - 1
11484 59989 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11485 61396 : + bias_for_lowest, lowest_vf) - 1);
11486 61412 : if (loop->any_estimate)
11487 35468 : loop->nb_iterations_estimate
11488 35468 : = (final_iter_may_be_partial
11489 36161 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11490 1386 : assumed_vf) - 1
11491 34775 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11492 70243 : assumed_vf) - 1);
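 : /* Worked example (illustrative): with an upper bound of 1002 latch
 : iterations, lowest_vf == 4 and bias_for_lowest == 1, the update above
 : yields floor ((1002 + 1) / 4) - 1 = 249 vector latch iterations; the
 : ceiling variant is used when the final iteration may be partial, so a
 : trailing partial vector still counts as an iteration. */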
11493 61412 : scale_profile_for_vect_loop (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
11494 : assumed_vf, flat);
11495 :
11496 61412 : if (dump_enabled_p ())
11497 : {
11498 10965 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11499 : {
11500 9511 : dump_printf_loc (MSG_NOTE, vect_location,
11501 : "LOOP VECTORIZED\n");
11502 9511 : if (loop->inner)
11503 345 : dump_printf_loc (MSG_NOTE, vect_location,
11504 : "OUTER LOOP VECTORIZED\n");
11505 9511 : dump_printf (MSG_NOTE, "\n");
11506 : }
11507 : else
11508 1454 : dump_printf_loc (MSG_NOTE, vect_location,
11509 : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11510 1454 : GET_MODE_NAME (loop_vinfo->vector_mode));
11511 : }
11512 :
11513 : /* Loops vectorized with a variable factor won't benefit from
11514 : unrolling/peeling. */
11515 61412 : if (!vf.is_constant ())
11516 : {
11517 : loop->unroll = 1;
11518 : if (dump_enabled_p ())
11519 : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11520 : " variable-length vectorization factor\n");
11521 : }
11522 :
11523 : /* When we have unrolled the loop due to a user-requested value we should
11524 : leave it up to the RTL unroll heuristics to determine whether it is still
11525 : worthwhile to unroll more. */
11526 61412 : if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
11527 44 : loop->unroll = 0;
11528 :
11529 : /* Free SLP instances here because otherwise stmt reference counting
11530 : won't work. */
11531 : slp_instance instance;
11532 150899 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11533 89487 : vect_free_slp_instance (instance);
11534 61412 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11535 : /* Clear the safelen field since its value is invalid after vectorization:
11536 : the vectorized loop can have loop-carried dependencies. */
11537 61412 : loop->safelen = 0;
11538 :
11539 61412 : if (epilogue)
11540 : {
11541 : /* Accumulate past advancements made. */
11542 6824 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
11543 83 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
11544 : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
11545 : advance);
11546 6824 : update_epilogue_loop_vinfo (epilogue, advance);
11547 :
11548 6824 : epilogue->simduid = loop->simduid;
11549 6824 : epilogue->force_vectorize = loop->force_vectorize;
11550 6824 : epilogue->dont_vectorize = false;
11551 : }
11552 :
11553 61412 : return epilogue;
11554 61412 : }
11555 :
11556 : /* The code below tries to perform a simple optimization - reverting
11557 : if-conversion for masked stores: if the mask of a store is zero, skip
11558 : the store and, where possible, the producers of the stored values too.
11559 : For example,
11560 : for (i=0; i<n; i++)
11561 : if (c[i])
11562 : {
11563 : p1[i] += 1;
11564 : p2[i] = p3[i] + 2;
11565 : }
11566 : this transformation will produce the following semi-hammock:
11567 :
11568 : if (!(mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 }))
11569 : {
11570 : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11571 : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11572 : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11573 : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11574 : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11575 : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11576 : }
11577 : */
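 : /* Illustration (not part of the pass): the transformation below turns
 :
 : bb: { ...; MASK_STORE (...); ... }
 :
 : into
 :
 : bb: if (mask == { 0, ... }) goto join_bb; else goto store_bb;
 : store_bb: MASK_STORE (...); /* plus any sunk value producers */
 : join_bb: ...
 :
 : so an all-zero mask skips the store and its producers at run time. */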
11578 :
11579 : void
11580 493 : optimize_mask_stores (class loop *loop)
11581 : {
11582 493 : basic_block *bbs = get_loop_body (loop);
11583 493 : unsigned nbbs = loop->num_nodes;
11584 493 : unsigned i;
11585 493 : basic_block bb;
11586 493 : class loop *bb_loop;
11587 493 : gimple_stmt_iterator gsi;
11588 493 : gimple *stmt;
11589 493 : auto_vec<gimple *> worklist;
11590 493 : auto_purge_vect_location sentinel;
11591 :
11592 493 : vect_location = find_loop_location (loop);
11593 : /* Pick up all masked stores in the loop, if any. */
11594 1972 : for (i = 0; i < nbbs; i++)
11595 : {
11596 986 : bb = bbs[i];
11597 17311 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11598 15339 : gsi_next (&gsi))
11599 : {
11600 15339 : stmt = gsi_stmt (gsi);
11601 15339 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11602 695 : worklist.safe_push (stmt);
11603 : }
11604 : }
11605 :
11606 493 : free (bbs);
11607 493 : if (worklist.is_empty ())
11608 68 : return;
11609 :
11610 : /* Loop has masked stores. */
11611 1103 : while (!worklist.is_empty ())
11612 : {
11613 678 : gimple *last, *last_store;
11614 678 : edge e, efalse;
11615 678 : tree mask;
11616 678 : basic_block store_bb, join_bb;
11617 678 : gimple_stmt_iterator gsi_to;
11618 678 : tree vdef, new_vdef;
11619 678 : gphi *phi;
11620 678 : tree vectype;
11621 678 : tree zero;
11622 :
11623 678 : last = worklist.pop ();
11624 678 : mask = gimple_call_arg (last, 2);
11625 678 : bb = gimple_bb (last);
11626 : /* Create then_bb and an if-then structure in the CFG; then_bb belongs
11627 : to the same loop as if_bb. That loop can differ from LOOP when a
11628 : two-level loop nest is vectorized and the mask_store belongs to the
11629 : inner loop. */
11630 678 : e = split_block (bb, last);
11631 678 : bb_loop = bb->loop_father;
11632 678 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11633 678 : join_bb = e->dest;
11634 678 : store_bb = create_empty_bb (bb);
11635 678 : add_bb_to_loop (store_bb, bb_loop);
11636 678 : e->flags = EDGE_TRUE_VALUE;
11637 678 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11638 : /* Make the edge into STORE_BB the likely one. */
11639 678 : efalse->probability = profile_probability::likely ();
11640 678 : e->probability = efalse->probability.invert ();
11641 678 : store_bb->count = efalse->count ();
11642 678 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11643 678 : if (dom_info_available_p (CDI_DOMINATORS))
11644 678 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11645 678 : if (dump_enabled_p ())
11646 351 : dump_printf_loc (MSG_NOTE, vect_location,
11647 : "Create new block %d to sink mask stores.",
11648 : store_bb->index);
11649 : /* Create vector comparison with boolean result. */
11650 678 : vectype = TREE_TYPE (mask);
11651 678 : zero = build_zero_cst (vectype);
11652 678 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11653 678 : gsi = gsi_last_bb (bb);
11654 678 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11655 : /* Create new PHI node for vdef of the last masked store:
11656 : .MEM_2 = VDEF <.MEM_1>
11657 : will be converted to
11658 : .MEM.3 = VDEF <.MEM_1>
11659 : and new PHI node will be created in join bb
11660 : .MEM_2 = PHI <.MEM_1, .MEM_3>
11661 : */
11662 678 : vdef = gimple_vdef (last);
11663 678 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
11664 678 : gimple_set_vdef (last, new_vdef);
11665 678 : phi = create_phi_node (vdef, join_bb);
11666 678 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11667 :
11668 : /* Put all masked stores with the same mask into STORE_BB if possible. */
11669 712 : while (true)
11670 : {
11671 695 : gimple_stmt_iterator gsi_from;
11672 695 : gimple *stmt1 = NULL;
11673 :
11674 : /* Move masked store to STORE_BB. */
11675 695 : last_store = last;
11676 695 : gsi = gsi_for_stmt (last);
11677 695 : gsi_from = gsi;
11678 : /* Shift GSI to the previous stmt for further traversal. */
11679 695 : gsi_prev (&gsi);
11680 695 : gsi_to = gsi_start_bb (store_bb);
11681 695 : gsi_move_before (&gsi_from, &gsi_to);
11682 : /* Setup GSI_TO to the non-empty block start. */
11683 : /* Reset GSI_TO to the start of the now non-empty block. */
11684 695 : if (dump_enabled_p ())
11685 367 : dump_printf_loc (MSG_NOTE, vect_location,
11686 : "Move stmt to created bb\n%G", last);
11687 : /* Move all stored value producers if possible. */
11688 4960 : while (!gsi_end_p (gsi))
11689 : {
11690 4959 : tree lhs;
11691 4959 : imm_use_iterator imm_iter;
11692 4959 : use_operand_p use_p;
11693 4959 : bool res;
11694 :
11695 : /* Skip debug statements. */
11696 4959 : if (is_gimple_debug (gsi_stmt (gsi)))
11697 : {
11698 3 : gsi_prev (&gsi);
11699 3225 : continue;
11700 : }
11701 4956 : stmt1 = gsi_stmt (gsi);
11702 : /* Do not consider statements that write to memory or have a
11703 : volatile operand. */
11704 9762 : if (gimple_vdef (stmt1)
11705 9762 : || gimple_has_volatile_ops (stmt1))
11706 : break;
11707 4806 : gsi_from = gsi;
11708 4806 : gsi_prev (&gsi);
11709 4806 : lhs = gimple_get_lhs (stmt1);
11710 4806 : if (!lhs)
11711 : break;
11712 :
11713 : /* LHS of vectorized stmt must be SSA_NAME. */
11714 4806 : if (TREE_CODE (lhs) != SSA_NAME)
11715 : break;
11716 :
11717 4806 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11718 : {
11719 : /* Remove dead scalar statement. */
11720 3554 : if (has_zero_uses (lhs))
11721 : {
11722 3222 : gsi_remove (&gsi_from, true);
11723 3222 : release_defs (stmt1);
11724 3222 : continue;
11725 : }
11726 : }
11727 :
11728 : /* Check that LHS does not have uses outside of STORE_BB. */
11729 1584 : res = true;
11730 4309 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11731 : {
11732 1685 : gimple *use_stmt;
11733 1685 : use_stmt = USE_STMT (use_p);
11734 1685 : if (is_gimple_debug (use_stmt))
11735 0 : continue;
11736 1685 : if (gimple_bb (use_stmt) != store_bb)
11737 : {
11738 : res = false;
11739 : break;
11740 : }
11741 1584 : }
11742 1584 : if (!res)
11743 : break;
11744 :
11745 1040 : if (gimple_vuse (stmt1)
11746 1476 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
11747 : break;
11748 :
11749 : /* Can move STMT1 to STORE_BB. */
11750 1040 : if (dump_enabled_p ())
11751 563 : dump_printf_loc (MSG_NOTE, vect_location,
11752 : "Move stmt to created bb\n%G", stmt1);
11753 1040 : gsi_move_before (&gsi_from, &gsi_to);
11754 : /* Shift GSI_TO for further insertion. */
11755 2080 : gsi_prev (&gsi_to);
11756 : }
11757 : /* Put other masked stores with the same mask into STORE_BB. */
11758 695 : if (worklist.is_empty ()
11759 270 : || gimple_call_arg (worklist.last (), 2) != mask
11760 17 : || worklist.last () != stmt1)
11761 : break;
11762 17 : last = worklist.pop ();
11763 17 : }
11764 1356 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11765 : }
11766 493 : }
11767 :
11768 : /* Decide whether it is possible to use a zero-based induction variable
11769 : when vectorizing LOOP_VINFO with partial vectors. If it is, return
11770 : the value that the induction variable must be able to hold in order
11771 : to ensure that the rgroups eventually have no active vector elements.
11772 : Return -1 otherwise. */
11773 :
11774 : widest_int
11775 46794 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11776 : {
11777 46794 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11778 46794 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11779 46794 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11780 :
11781 : /* Calculate the value that the induction variable must be able
11782 : to hit in order to ensure that we end the loop with an all-false mask.
11783 : This involves adding the maximum number of inactive trailing scalar
11784 : iterations. */
11785 46794 : widest_int iv_limit = -1;
11786 46794 : if (max_loop_iterations (loop, &iv_limit))
11787 : {
11788 46794 : if (niters_skip)
11789 : {
11790 : /* Add the maximum number of skipped iterations to the
11791 : maximum iteration count. */
11792 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
11793 0 : iv_limit += wi::to_widest (niters_skip);
11794 : else
11795 0 : iv_limit += max_vf - 1;
11796 : }
11797 46794 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11798 : /* Make a conservatively-correct assumption. */
11799 344 : iv_limit += max_vf - 1;
11800 :
11801 : /* IV_LIMIT is the maximum number of latch iterations, which is also
11802 : the maximum in-range IV value. Round this value down to the previous
11803 : vector alignment boundary and then add an extra full iteration. */
11804 46794 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11805 46794 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11806 : }
11807 46794 : return iv_limit;
11808 : }
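 : /* Worked example (illustrative): with a maximum latch count of 1002,
 : VF == max_vf == 4 and no skipped or peeled iterations, the function
 : computes (1002 & -4) + 4 = 1000 + 4 = 1004: the IV must be able to
 : hold the first VF-aligned value past the maximum in-range IV value. */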
11809 :
11810 : /* For the given rgroup_controls RGC, check whether an induction variable
11811 : would ever hit a value that produces a set of all-false masks or zero
11812 : lengths before wrapping around. Return true if it's possible to wrap
11813 : around before hitting the desired value, otherwise return false. */
11814 :
11815 : bool
11816 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11817 : {
11818 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11819 :
11820 0 : if (iv_limit == -1)
11821 : return true;
11822 :
11823 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11824 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
11825 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11826 :
11827 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11828 : return true;
11829 :
11830 : return false;
11831 0 : }
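 : /* Worked example (illustrative): if IV_LIMIT is 1004 and the rgroup
 : handles max_nscalars_per_iter * factor == 2 items per scalar iteration,
 : the IV must reach 1004 * 2 = 2008, which needs
 : wi::min_precision (2008, UNSIGNED) == 11 bits; a 16-bit compare type
 : is safe while an 8-bit one might wrap. */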