Branch data Line data Source code
1 : : /* Loop Vectorization
2 : : Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : : Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #define INCLUDE_ALGORITHM
23 : : #include "config.h"
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "cfghooks.h"
32 : : #include "tree-pass.h"
33 : : #include "ssa.h"
34 : : #include "optabs-tree.h"
35 : : #include "memmodel.h"
36 : : #include "optabs.h"
37 : : #include "diagnostic-core.h"
38 : : #include "fold-const.h"
39 : : #include "stor-layout.h"
40 : : #include "cfganal.h"
41 : : #include "gimplify.h"
42 : : #include "gimple-iterator.h"
43 : : #include "gimplify-me.h"
44 : : #include "tree-ssa-loop-ivopts.h"
45 : : #include "tree-ssa-loop-manip.h"
46 : : #include "tree-ssa-loop-niter.h"
47 : : #include "tree-ssa-loop.h"
48 : : #include "cfgloop.h"
49 : : #include "tree-scalar-evolution.h"
50 : : #include "tree-vectorizer.h"
51 : : #include "gimple-fold.h"
52 : : #include "cgraph.h"
53 : : #include "tree-cfg.h"
54 : : #include "tree-if-conv.h"
55 : : #include "internal-fn.h"
56 : : #include "tree-vector-builder.h"
57 : : #include "vec-perm-indices.h"
58 : : #include "tree-eh.h"
59 : : #include "case-cfn-macros.h"
60 : : #include "langhooks.h"
61 : : #include "opts.h"
62 : :
63 : : /* Loop Vectorization Pass.
64 : :
65 : : This pass tries to vectorize loops.
66 : :
67 : : For example, the vectorizer transforms the following simple loop:
68 : :
69 : : short a[N]; short b[N]; short c[N]; int i;
70 : :
71 : : for (i=0; i<N; i++){
72 : : a[i] = b[i] + c[i];
73 : : }
74 : :
75 : : as if it was manually vectorized by rewriting the source code into:
76 : :
77 : : typedef int __attribute__((mode(V8HI))) v8hi;
78 : : short a[N]; short b[N]; short c[N]; int i;
79 : : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
80 : : v8hi va, vb, vc;
81 : :
82 : : for (i=0; i<N/8; i++){
83 : : vb = pb[i];
84 : : vc = pc[i];
85 : : va = vb + vc;
86 : : pa[i] = va;
87 : : }
88 : :
89 : : The main entry to this pass is vectorize_loops(), in which
90 : : the vectorizer applies a set of analyses on a given set of loops,
91 : : followed by the actual vectorization transformation for the loops that
92 : : had successfully passed the analysis phase.
93 : : Throughout this pass we make a distinction between two types of
94 : : data: scalars (which are represented by SSA_NAMES), and memory references
95 : : ("data-refs"). These two types of data require different handling both
96 : : during analysis and transformation. The types of data-refs that the
97 : : vectorizer currently supports are ARRAY_REFS which base is an array DECL
98 : : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
99 : : accesses are required to have a simple (consecutive) access pattern.
100 : :
101 : : Analysis phase:
102 : : ===============
103 : : The driver for the analysis phase is vect_analyze_loop().
104 : : It applies a set of analyses, some of which rely on the scalar evolution
105 : : analyzer (scev) developed by Sebastian Pop.
106 : :
107 : : During the analysis phase the vectorizer records some information
108 : : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
109 : : loop, as well as general information about the loop as a whole, which is
110 : : recorded in a "loop_vec_info" struct attached to each loop.
111 : :
112 : : Transformation phase:
113 : : =====================
114 : : The loop transformation phase scans all the stmts in the loop, and
115 : : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
116 : : the loop that needs to be vectorized. It inserts the vector code sequence
117 : : just before the scalar stmt S, and records a pointer to the vector code
118 : : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
119 : : attached to S). This pointer will be used for the vectorization of following
120 : : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
121 : : otherwise, we rely on dead code elimination for removing it.
122 : :
123 : : For example, say stmt S1 was vectorized into stmt VS1:
124 : :
125 : : VS1: vb = px[i];
126 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
127 : : S2: a = b;
128 : :
129 : : To vectorize stmt S2, the vectorizer first finds the stmt that defines
130 : : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
131 : : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
132 : : resulting sequence would be:
133 : :
134 : : VS1: vb = px[i];
135 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
136 : : VS2: va = vb;
137 : : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 : :
139 : : Operands that are not SSA_NAMEs, are data-refs that appear in
140 : : load/store operations (like 'x[i]' in S1), and are handled differently.
141 : :
142 : : Target modeling:
143 : : =================
144 : : Currently the only target specific information that is used is the
145 : : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
146 : : Targets that can support different sizes of vectors, for now will need
147 : : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
148 : : flexibility will be added in the future.
149 : :
150 : : Since we only vectorize operations which vector form can be
151 : : expressed using existing tree codes, to verify that an operation is
152 : : supported, the vectorizer checks the relevant optab at the relevant
153 : : machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
154 : : the value found is CODE_FOR_nothing, then there's no target support, and
155 : : we can't vectorize the stmt.
156 : :
157 : : For additional information on this project see:
158 : : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
159 : : */
160 : :
161 : : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
162 : : unsigned *);
163 : : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
164 : : gphi **, bool *, bool);
165 : :
166 : :
167 : : /* Function vect_is_simple_iv_evolution.
168 : :
169 : : FORNOW: A simple evolution of an induction variables in the loop is
170 : : considered a polynomial evolution. */
171 : :
172 : : static bool
173 : 664058 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
174 : : stmt_vec_info stmt_info)
175 : : {
176 : 664058 : tree init_expr;
177 : 664058 : tree step_expr;
178 : 664058 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
179 : 664058 : basic_block bb;
180 : :
181 : : /* When there is no evolution in this loop, the evolution function
182 : : is not "simple". */
183 : 664058 : if (evolution_part == NULL_TREE)
184 : : return false;
185 : :
186 : : /* When the evolution is a polynomial of degree >= 2
187 : : the evolution function is not "simple". */
188 : 706165 : if (tree_is_chrec (evolution_part))
189 : : return false;
190 : :
191 : 614668 : step_expr = evolution_part;
192 : 614668 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
193 : :
194 : 614668 : if (dump_enabled_p ())
195 : 36068 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
196 : : step_expr, init_expr);
197 : :
198 : 614668 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
199 : 614668 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;
200 : :
201 : 614668 : if (TREE_CODE (step_expr) != INTEGER_CST
202 : 48742 : && (TREE_CODE (step_expr) != SSA_NAME
203 : 40972 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
204 : 40810 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
205 : 6659 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
206 : 111 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
207 : 111 : || !flag_associative_math)))
208 : 656832 : && (TREE_CODE (step_expr) != REAL_CST
209 : 443 : || !flag_associative_math))
210 : : {
211 : 42107 : if (dump_enabled_p ())
212 : 2714 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
213 : : "step unknown.\n");
214 : 42107 : return false;
215 : : }
216 : :
217 : : return true;
218 : : }
219 : :
220 : : /* Function vect_is_nonlinear_iv_evolution
221 : :
222 : : Only support nonlinear induction for integer type
223 : : 1. neg
224 : : 2. mul by constant
225 : : 3. lshift/rshift by constant.
226 : :
227 : : For neg induction, return a fake step as integer -1. */
228 : : static bool
229 : 89142 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
230 : : gphi* loop_phi_node)
231 : : {
232 : 89142 : tree init_expr, ev_expr, result, op1, op2;
233 : 89142 : gimple* def;
234 : :
235 : 89142 : if (gimple_phi_num_args (loop_phi_node) != 2)
236 : : return false;
237 : :
238 : 89142 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
239 : 89142 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
240 : :
241 : : /* Support nonlinear induction only for integer type. */
242 : 89142 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
243 : : return false;
244 : :
245 : 66286 : result = PHI_RESULT (loop_phi_node);
246 : :
247 : 66286 : if (TREE_CODE (ev_expr) != SSA_NAME
248 : 64187 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
249 : 66286 : || !is_gimple_assign (def))
250 : : return false;
251 : :
252 : 59345 : enum tree_code t_code = gimple_assign_rhs_code (def);
253 : 59345 : tree step;
254 : 59345 : switch (t_code)
255 : : {
256 : 1554 : case NEGATE_EXPR:
257 : 1554 : if (gimple_assign_rhs1 (def) != result)
258 : : return false;
259 : 1554 : step = build_int_cst (TREE_TYPE (init_expr), -1);
260 : 1554 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
261 : 1554 : break;
262 : :
263 : 9525 : case RSHIFT_EXPR:
264 : 9525 : case LSHIFT_EXPR:
265 : 9525 : case MULT_EXPR:
266 : 9525 : op1 = gimple_assign_rhs1 (def);
267 : 9525 : op2 = gimple_assign_rhs2 (def);
268 : 9525 : if (TREE_CODE (op2) != INTEGER_CST
269 : 6049 : || op1 != result)
270 : : return false;
271 : 5920 : step = op2;
272 : 5920 : if (t_code == LSHIFT_EXPR)
273 : 186 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
274 : 5734 : else if (t_code == RSHIFT_EXPR)
275 : 5129 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
276 : : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
277 : : else
278 : 605 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
279 : : break;
280 : :
281 : : default:
282 : : return false;
283 : : }
284 : :
285 : 7474 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
286 : 7474 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
287 : :
288 : 7474 : return true;
289 : : }
290 : :
291 : : /* Returns true if Phi is a first-order recurrence. A first-order
292 : : recurrence is a non-reduction recurrence relation in which the value of
293 : : the recurrence in the current loop iteration equals a value defined in
294 : : the previous iteration. */
295 : :
296 : : static bool
297 : 20987 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
298 : : gphi *phi)
299 : : {
300 : : /* A nested cycle isn't vectorizable as first order recurrence. */
301 : 20987 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
302 : : return false;
303 : :
304 : : /* Ensure the loop latch definition is from within the loop. */
305 : 20821 : edge latch = loop_latch_edge (loop);
306 : 20821 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
307 : 20821 : if (TREE_CODE (ldef) != SSA_NAME
308 : 18441 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
309 : 18413 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
310 : 37958 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
311 : 3994 : return false;
312 : :
313 : 16827 : tree def = gimple_phi_result (phi);
314 : :
315 : : /* Ensure every use_stmt of the phi node is dominated by the latch
316 : : definition. */
317 : 16827 : imm_use_iterator imm_iter;
318 : 16827 : use_operand_p use_p;
319 : 19107 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
320 : 18572 : if (!is_gimple_debug (USE_STMT (use_p))
321 : 36188 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
322 : 10665 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
323 : : USE_STMT (use_p))))
324 : 16292 : return false;
325 : :
326 : : /* First-order recurrence autovectorization needs shuffle vector. */
327 : 535 : tree scalar_type = TREE_TYPE (def);
328 : 535 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
329 : 535 : if (!vectype)
330 : : return false;
331 : :
332 : : return true;
333 : : }
334 : :
335 : : /* Function vect_analyze_scalar_cycles_1.
336 : :
337 : : Examine the cross iteration def-use cycles of scalar variables
338 : : in LOOP. LOOP_VINFO represents the loop that is now being
339 : : considered for vectorization (can be LOOP, or an outer-loop
340 : : enclosing LOOP). SLP indicates there will be some subsequent
341 : : slp analyses or not. */
342 : :
343 : : static void
344 : 324227 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
345 : : bool slp)
346 : : {
347 : 324227 : basic_block bb = loop->header;
348 : 324227 : auto_vec<stmt_vec_info, 64> worklist;
349 : 324227 : gphi_iterator gsi;
350 : :
351 : 324227 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
352 : :
353 : : /* First - identify all inductions. Reduction detection assumes that all the
354 : : inductions have been identified, therefore, this order must not be
355 : : changed. */
356 : 1165610 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
357 : : {
358 : 841383 : gphi *phi = gsi.phi ();
359 : 841383 : tree access_fn = NULL;
360 : 841383 : tree def = PHI_RESULT (phi);
361 : 841383 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
362 : :
363 : : /* Skip virtual phi's. The data dependences that are associated with
364 : : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
365 : 1682766 : if (virtual_operand_p (def))
366 : 261354 : continue;
367 : :
368 : : /* Skip already analyzed inner loop PHIs of double reductions. */
369 : 664960 : if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
370 : 902 : continue;
371 : :
372 : 664058 : if (dump_enabled_p ())
373 : 37852 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
374 : : (gimple *) phi);
375 : :
376 : 664058 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
377 : :
378 : : /* Analyze the evolution function. */
379 : 664058 : access_fn = analyze_scalar_evolution (loop, def);
380 : 664058 : if (dump_enabled_p ())
381 : 37852 : dump_printf_loc (MSG_NOTE, vect_location,
382 : : "Access function of PHI: %T\n", access_fn);
383 : 664058 : if (access_fn)
384 : 664058 : STRIP_NOPS (access_fn);
385 : :
386 : 748087 : if ((!access_fn
387 : 664058 : || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
388 : 572561 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
389 : 9837 : && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
390 : : != INTEGER_CST)))
391 : : /* Only handle nonlinear iv for same loop. */
392 : 755561 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
393 : 89142 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
394 : : {
395 : 84029 : worklist.safe_push (stmt_vinfo);
396 : 84029 : continue;
397 : : }
398 : :
399 : 580029 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
400 : : != NULL_TREE);
401 : 580029 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
402 : :
403 : 580029 : if (dump_enabled_p ())
404 : 33458 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
405 : 580029 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
406 : :
407 : : /* Mark if we have a non-linear IV. */
408 : 580029 : LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
409 : 580029 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
410 : : }
411 : :
412 : :
413 : : /* Second - identify all reductions and nested cycles. */
414 : 408256 : while (worklist.length () > 0)
415 : : {
416 : 84029 : stmt_vec_info stmt_vinfo = worklist.pop ();
417 : 84029 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
418 : 84029 : tree def = PHI_RESULT (phi);
419 : :
420 : 84029 : if (dump_enabled_p ())
421 : 4394 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
422 : : (gimple *) phi);
423 : :
424 : 168058 : gcc_assert (!virtual_operand_p (def)
425 : : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
426 : :
427 : 84029 : gphi *double_reduc;
428 : 84029 : bool reduc_chain;
429 : 84029 : stmt_vec_info reduc_stmt_info
430 : 84029 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
431 : 84029 : &reduc_chain, slp);
432 : 84029 : if (reduc_stmt_info && double_reduc)
433 : : {
434 : 991 : bool inner_chain;
435 : 991 : stmt_vec_info inner_phi_info
436 : 991 : = loop_vinfo->lookup_stmt (double_reduc);
437 : : /* ??? Pass down flag we're the inner loop of a double reduc. */
438 : 991 : stmt_vec_info inner_reduc_info
439 : 991 : = vect_is_simple_reduction (loop_vinfo, inner_phi_info,
440 : : NULL, &inner_chain, slp);
441 : 991 : if (inner_reduc_info)
442 : : {
443 : 902 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
444 : 902 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
445 : 902 : STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
446 : 902 : STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
447 : 902 : if (dump_enabled_p ())
448 : 122 : dump_printf_loc (MSG_NOTE, vect_location,
449 : : "Detected double reduction.\n");
450 : :
451 : 902 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
452 : 902 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
453 : 902 : STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
454 : : /* Make it accessible for SLP vectorization. */
455 : 902 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
456 : : }
457 : 89 : else if (dump_enabled_p ())
458 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
459 : : "Unknown def-use cycle pattern.\n");
460 : 991 : }
461 : 83038 : else if (reduc_stmt_info)
462 : : {
463 : 62051 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
464 : : {
465 : 2195 : if (dump_enabled_p ())
466 : 361 : dump_printf_loc (MSG_NOTE, vect_location,
467 : : "Detected vectorizable nested cycle.\n");
468 : :
469 : 2195 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
470 : : }
471 : : else
472 : : {
473 : 59856 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
474 : 59856 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
475 : 59856 : if (dump_enabled_p ())
476 : 3457 : dump_printf_loc (MSG_NOTE, vect_location,
477 : : "Detected reduction.\n");
478 : :
479 : 59856 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
480 : 59856 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
481 : : /* Store the reduction cycles for possible vectorization in
482 : : loop-aware SLP if it was not detected as reduction
483 : : chain. */
484 : 59856 : if (! reduc_chain)
485 : 56591 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
486 : 56591 : (reduc_stmt_info);
487 : : }
488 : : }
489 : 20987 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
490 : 529 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
491 : : else
492 : 20458 : if (dump_enabled_p ())
493 : 368 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
494 : : "Unknown def-use cycle pattern.\n");
495 : : }
496 : 324227 : }
497 : :
498 : :
499 : : /* Function vect_analyze_scalar_cycles.
500 : :
501 : : Examine the cross iteration def-use cycles of scalar variables, by
502 : : analyzing the loop-header PHIs of scalar variables. Classify each
503 : : cycle as one of the following: invariant, induction, reduction, unknown.
504 : : We do that for the loop represented by LOOP_VINFO, and also to its
505 : : inner-loop, if exists.
506 : : Examples for scalar cycles:
507 : :
508 : : Example1: reduction:
509 : :
510 : : loop1:
511 : : for (i=0; i<N; i++)
512 : : sum += a[i];
513 : :
514 : : Example2: induction:
515 : :
516 : : loop2:
517 : : for (i=0; i<N; i++)
518 : : a[i] = i; */
519 : :
520 : : static void
521 : 319232 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
522 : : {
523 : 319232 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
524 : :
525 : 319232 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
526 : :
527 : : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
528 : : Reductions in such inner-loop therefore have different properties than
529 : : the reductions in the nest that gets vectorized:
530 : : 1. When vectorized, they are executed in the same order as in the original
531 : : scalar loop, so we can't change the order of computation when
532 : : vectorizing them.
533 : : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
534 : : current checks are too strict. */
535 : :
536 : 319232 : if (loop->inner)
537 : 4995 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
538 : 319232 : }
539 : :
540 : : /* Transfer group and reduction information from STMT_INFO to its
541 : : pattern stmt. */
542 : :
543 : : static void
544 : 32 : vect_fixup_reduc_chain (stmt_vec_info stmt_info)
545 : : {
546 : 32 : stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
547 : 32 : stmt_vec_info stmtp;
548 : 32 : gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
549 : : && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
550 : 32 : REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
551 : 306 : do
552 : : {
553 : 306 : stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
554 : 306 : gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
555 : : == STMT_VINFO_DEF_TYPE (stmt_info));
556 : 306 : REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
557 : 306 : stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
558 : 306 : if (stmt_info)
559 : 274 : REDUC_GROUP_NEXT_ELEMENT (stmtp)
560 : 274 : = STMT_VINFO_RELATED_STMT (stmt_info);
561 : : }
562 : 306 : while (stmt_info);
563 : 32 : }
564 : :
565 : : /* Fixup scalar cycles that now have their stmts detected as patterns. */
566 : :
567 : : static void
568 : 319232 : vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
569 : : {
570 : 319232 : stmt_vec_info first;
571 : 319232 : unsigned i;
572 : :
573 : 322497 : FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
574 : : {
575 : 3265 : stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
576 : 8417 : while (next)
577 : : {
578 : 5189 : if ((STMT_VINFO_IN_PATTERN_P (next)
579 : 5189 : != STMT_VINFO_IN_PATTERN_P (first))
580 : 10341 : || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
581 : : break;
582 : 5152 : next = REDUC_GROUP_NEXT_ELEMENT (next);
583 : : }
584 : : /* If all reduction chain members are well-formed patterns adjust
585 : : the group to group the pattern stmts instead. */
586 : 3265 : if (! next
587 : 3297 : && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
588 : : {
589 : 3228 : if (STMT_VINFO_IN_PATTERN_P (first))
590 : : {
591 : 32 : vect_fixup_reduc_chain (first);
592 : 64 : LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
593 : 32 : = STMT_VINFO_RELATED_STMT (first);
594 : : }
595 : : }
596 : : /* If not all stmt in the chain are patterns or if we failed
597 : : to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
598 : : it as regular reduction instead. */
599 : : else
600 : : {
601 : : stmt_vec_info vinfo = first;
602 : : stmt_vec_info last = NULL;
603 : 144 : while (vinfo)
604 : : {
605 : 107 : next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
606 : 107 : REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
607 : 107 : REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
608 : 107 : last = vinfo;
609 : 107 : vinfo = next;
610 : : }
611 : 37 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
612 : 37 : = vect_internal_def;
613 : 39 : loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
614 : 37 : LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
615 : 37 : --i;
616 : : }
617 : : }
618 : 319232 : }
619 : :
620 : : /* Function vect_get_loop_niters.
621 : :
622 : : Determine how many iterations the loop is executed and place it
623 : : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
624 : : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
625 : : niter information holds in ASSUMPTIONS.
626 : :
627 : : Return the loop exit conditions. */
628 : :
629 : :
630 : : static vec<gcond *>
631 : 262841 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
632 : : tree *number_of_iterations, tree *number_of_iterationsm1)
633 : : {
634 : 262841 : auto_vec<edge> exits = get_loop_exit_edges (loop);
635 : 262841 : vec<gcond *> conds;
636 : 525682 : conds.create (exits.length ());
637 : 262841 : class tree_niter_desc niter_desc;
638 : 262841 : tree niter_assumptions, niter, may_be_zero;
639 : :
640 : 262841 : *assumptions = boolean_true_node;
641 : 262841 : *number_of_iterationsm1 = chrec_dont_know;
642 : 262841 : *number_of_iterations = chrec_dont_know;
643 : :
644 : 262841 : DUMP_VECT_SCOPE ("get_loop_niters");
645 : :
646 : 262841 : if (exits.is_empty ())
647 : 0 : return conds;
648 : :
649 : 262841 : if (dump_enabled_p ())
650 : 13628 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
651 : : exits.length ());
652 : :
653 : : edge exit;
654 : : unsigned int i;
655 : 635813 : FOR_EACH_VEC_ELT (exits, i, exit)
656 : : {
657 : 372972 : gcond *cond = get_loop_exit_condition (exit);
658 : 372972 : if (cond)
659 : 362423 : conds.safe_push (cond);
660 : :
661 : 372972 : if (dump_enabled_p ())
662 : 14589 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
663 : :
664 : 372972 : if (exit != main_exit)
665 : 151024 : continue;
666 : :
667 : 262841 : may_be_zero = NULL_TREE;
668 : 262841 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
669 : 262841 : || chrec_contains_undetermined (niter_desc.niter))
670 : 40893 : continue;
671 : :
672 : 221948 : niter_assumptions = niter_desc.assumptions;
673 : 221948 : may_be_zero = niter_desc.may_be_zero;
674 : 221948 : niter = niter_desc.niter;
675 : :
676 : 221948 : if (may_be_zero && integer_zerop (may_be_zero))
677 : : may_be_zero = NULL_TREE;
678 : :
679 : 13044 : if (may_be_zero)
680 : : {
681 : 13044 : if (COMPARISON_CLASS_P (may_be_zero))
682 : : {
683 : : /* Try to combine may_be_zero with assumptions, this can simplify
684 : : computation of niter expression. */
685 : 13044 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
686 : 1134 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
687 : : niter_assumptions,
688 : : fold_build1 (TRUTH_NOT_EXPR,
689 : : boolean_type_node,
690 : : may_be_zero));
691 : : else
692 : 11910 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
693 : : build_int_cst (TREE_TYPE (niter), 0),
694 : : rewrite_to_non_trapping_overflow (niter));
695 : :
696 : 221948 : may_be_zero = NULL_TREE;
697 : : }
698 : 0 : else if (integer_nonzerop (may_be_zero))
699 : : {
700 : 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
701 : 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
702 : 0 : continue;
703 : : }
704 : : else
705 : 0 : continue;
706 : : }
707 : :
708 : : /* Loop assumptions are based off the normal exit. */
709 : 221948 : *assumptions = niter_assumptions;
710 : 221948 : *number_of_iterationsm1 = niter;
711 : :
712 : : /* We want the number of loop header executions which is the number
713 : : of latch executions plus one.
714 : : ??? For UINT_MAX latch executions this number overflows to zero
715 : : for loops like do { n++; } while (n != 0); */
716 : 221948 : if (niter && !chrec_contains_undetermined (niter))
717 : : {
718 : 221948 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
719 : : unshare_expr (niter),
720 : : build_int_cst (TREE_TYPE (niter), 1));
721 : 221948 : if (TREE_CODE (niter) == INTEGER_CST
722 : 119761 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
723 : : {
724 : : /* If we manage to fold niter + 1 into INTEGER_CST even when
725 : : niter is some complex expression, ensure back
726 : : *number_of_iterationsm1 is an INTEGER_CST as well. See
727 : : PR113210. */
728 : 4 : *number_of_iterationsm1
729 : 4 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
730 : : build_minus_one_cst (TREE_TYPE (niter)));
731 : : }
732 : : }
733 : 221948 : *number_of_iterations = niter;
734 : : }
735 : :
736 : 262841 : if (dump_enabled_p ())
737 : 13628 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
738 : :
739 : 262841 : return conds;
740 : 262841 : }
741 : :
742 : : /* Determine the main loop exit for the vectorizer. */
743 : :
744 : : edge
745 : 495925 : vec_init_loop_exit_info (class loop *loop)
746 : : {
747 : : /* Before we begin we must first determine which exit is the main one and
748 : : which are auxilary exits. */
749 : 495925 : auto_vec<edge> exits = get_loop_exit_edges (loop);
750 : 495925 : if (exits.length () == 1)
751 : 314903 : return exits[0];
752 : :
753 : : /* If we have multiple exits we only support counting IV at the moment.
754 : : Analyze all exits and return the last one we can analyze. */
755 : 181022 : class tree_niter_desc niter_desc;
756 : 181022 : edge candidate = NULL;
757 : 1196798 : for (edge exit : exits)
758 : : {
759 : 663626 : if (!get_loop_exit_condition (exit))
760 : 157419 : continue;
761 : :
762 : 506207 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
763 : 506207 : && !chrec_contains_undetermined (niter_desc.niter))
764 : : {
765 : 138416 : tree may_be_zero = niter_desc.may_be_zero;
766 : 138416 : if ((integer_zerop (may_be_zero)
767 : : /* As we are handling may_be_zero that's not false by
768 : : rewriting niter to may_be_zero ? 0 : niter we require
769 : : an empty latch. */
770 : 677036 : || (single_pred_p (loop->latch)
771 : 13047 : && exit->src == single_pred (loop->latch)
772 : 4521 : && (integer_nonzerop (may_be_zero)
773 : 4521 : || COMPARISON_CLASS_P (may_be_zero))))
774 : 142937 : && (!candidate
775 : 6603 : || dominated_by_p (CDI_DOMINATORS, exit->src,
776 : 6603 : candidate->src)))
777 : : candidate = exit;
778 : : }
779 : : }
780 : :
781 : 181022 : return candidate;
782 : 181022 : }
783 : :
784 : : /* Function bb_in_loop_p
785 : :
786 : : Used as predicate for dfs order traversal of the loop bbs. */
787 : :
788 : : static bool
789 : 1312577 : bb_in_loop_p (const_basic_block bb, const void *data)
790 : : {
791 : 1312577 : const class loop *const loop = (const class loop *)data;
792 : 1312577 : if (flow_bb_inside_loop_p (loop, bb))
793 : : return true;
794 : : return false;
795 : : }
796 : :
797 : :
798 : : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
799 : : stmt_vec_info structs for all the stmts in LOOP_IN. */
800 : :
801 : 414444 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
802 : : : vec_info (vec_info::loop, shared),
803 : 414444 : loop (loop_in),
804 : 414444 : num_itersm1 (NULL_TREE),
805 : 414444 : num_iters (NULL_TREE),
806 : 414444 : num_iters_unchanged (NULL_TREE),
807 : 414444 : num_iters_assumptions (NULL_TREE),
808 : 414444 : vector_costs (nullptr),
809 : 414444 : scalar_costs (nullptr),
810 : 414444 : th (0),
811 : 414444 : versioning_threshold (0),
812 : 414444 : vectorization_factor (0),
813 : 414444 : main_loop_edge (nullptr),
814 : 414444 : skip_main_loop_edge (nullptr),
815 : 414444 : skip_this_loop_edge (nullptr),
816 : 414444 : reusable_accumulators (),
817 : 414444 : suggested_unroll_factor (1),
818 : 414444 : max_vectorization_factor (0),
819 : 414444 : mask_skip_niters (NULL_TREE),
820 : 414444 : mask_skip_niters_pfa_offset (NULL_TREE),
821 : 414444 : rgroup_compare_type (NULL_TREE),
822 : 414444 : simd_if_cond (NULL_TREE),
823 : 414444 : partial_vector_style (vect_partial_vectors_none),
824 : 414444 : unaligned_dr (NULL),
825 : 414444 : peeling_for_alignment (0),
826 : 414444 : ptr_mask (0),
827 : 414444 : max_spec_read_amount (0),
828 : 414444 : nonlinear_iv (false),
829 : 414444 : ivexpr_map (NULL),
830 : 414444 : scan_map (NULL),
831 : 414444 : slp_unrolling_factor (1),
832 : 414444 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
833 : 414444 : vectorizable (false),
834 : 414444 : can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
835 : 414444 : must_use_partial_vectors_p (false),
836 : 414444 : using_partial_vectors_p (false),
837 : 414444 : using_decrementing_iv_p (false),
838 : 414444 : using_select_vl_p (false),
839 : 414444 : epil_using_partial_vectors_p (false),
840 : 414444 : allow_mutual_alignment (false),
841 : 414444 : partial_load_store_bias (0),
842 : 414444 : peeling_for_gaps (false),
843 : 414444 : peeling_for_niter (false),
844 : 414444 : early_breaks (false),
845 : 414444 : user_unroll (false),
846 : 414444 : no_data_dependencies (false),
847 : 414444 : has_mask_store (false),
848 : 414444 : scalar_loop_scaling (profile_probability::uninitialized ()),
849 : 414444 : scalar_loop (NULL),
850 : 414444 : main_loop_info (NULL),
851 : 414444 : orig_loop_info (NULL),
852 : 414444 : epilogue_vinfo (NULL),
853 : 414444 : drs_advanced_by (NULL_TREE),
854 : 414444 : vec_loop_iv_exit (NULL),
855 : 414444 : vec_epilogue_loop_iv_exit (NULL),
856 : 414444 : scalar_loop_iv_exit (NULL)
857 : : {
858 : : /* CHECKME: We want to visit all BBs before their successors (except for
859 : : latch blocks, for which this assertion wouldn't hold). In the simple
860 : : case of the loop forms we allow, a dfs order of the BBs would the same
861 : : as reversed postorder traversal, so we are safe. */
862 : :
863 : 414444 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
864 : 828888 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
865 : 414444 : loop->num_nodes, loop);
866 : 414444 : gcc_assert (nbbs == loop->num_nodes);
867 : :
868 : 1497804 : for (unsigned int i = 0; i < nbbs; i++)
869 : : {
870 : 1083360 : basic_block bb = bbs[i];
871 : 1083360 : gimple_stmt_iterator si;
872 : :
873 : 2202087 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
874 : : {
875 : 1118727 : gimple *phi = gsi_stmt (si);
876 : 1118727 : gimple_set_uid (phi, 0);
877 : 1118727 : add_stmt (phi);
878 : : }
879 : :
880 : 9351322 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
881 : : {
882 : 7184602 : gimple *stmt = gsi_stmt (si);
883 : 7184602 : gimple_set_uid (stmt, 0);
884 : 7184602 : if (is_gimple_debug (stmt))
885 : 2731785 : continue;
886 : 4452817 : add_stmt (stmt);
887 : : /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
888 : : third argument is the #pragma omp simd if (x) condition, when 0,
889 : : loop shouldn't be vectorized, when non-zero constant, it should
890 : : be vectorized normally, otherwise versioned with vectorized loop
891 : : done if the condition is non-zero at runtime. */
892 : 4452817 : if (loop_in->simduid
893 : 43319 : && is_gimple_call (stmt)
894 : 4259 : && gimple_call_internal_p (stmt)
895 : 4132 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
896 : 4131 : && gimple_call_num_args (stmt) >= 3
897 : 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
898 : 4452920 : && (loop_in->simduid
899 : 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
900 : : {
901 : 103 : tree arg = gimple_call_arg (stmt, 2);
902 : 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
903 : 103 : simd_if_cond = arg;
904 : : else
905 : 0 : gcc_assert (integer_nonzerop (arg));
906 : : }
907 : : }
908 : : }
909 : 414444 : }
910 : :
911 : : /* Free all levels of rgroup CONTROLS. */
912 : :
913 : : void
914 : 1076842 : release_vec_loop_controls (vec<rgroup_controls> *controls)
915 : : {
916 : 1076842 : rgroup_controls *rgc;
917 : 1076842 : unsigned int i;
918 : 1076887 : FOR_EACH_VEC_ELT (*controls, i, rgc)
919 : 45 : rgc->controls.release ();
920 : 1076842 : controls->release ();
921 : 1076842 : }
922 : :
923 : : /* Free all memory used by the _loop_vec_info, as well as all the
924 : : stmt_vec_info structs of all the stmts in the loop. */
925 : :
926 : 414444 : _loop_vec_info::~_loop_vec_info ()
927 : : {
928 : 414444 : free (bbs);
929 : :
930 : 414444 : release_vec_loop_controls (&masks.rgc_vec);
931 : 414444 : release_vec_loop_controls (&lens);
932 : 418218 : delete ivexpr_map;
933 : 414766 : delete scan_map;
934 : 414444 : delete scalar_costs;
935 : 414444 : delete vector_costs;
936 : 552457 : for (auto reduc_info : reduc_infos)
937 : 130850 : delete reduc_info;
938 : :
939 : : /* When we release an epiloge vinfo that we do not intend to use
940 : : avoid clearing AUX of the main loop which should continue to
941 : : point to the main loop vinfo since otherwise we'll leak that. */
942 : 414444 : if (loop->aux == this)
943 : 57682 : loop->aux = NULL;
944 : 828888 : }
945 : :
946 : : /* Return an invariant or register for EXPR and emit necessary
947 : : computations in the LOOP_VINFO loop preheader. */
948 : :
949 : : tree
950 : 19857 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
951 : : {
952 : 19857 : if (is_gimple_reg (expr)
953 : 19857 : || is_gimple_min_invariant (expr))
954 : 6507 : return expr;
955 : :
956 : 13350 : if (! loop_vinfo->ivexpr_map)
957 : 3774 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
958 : 13350 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
959 : 13350 : if (! cached)
960 : : {
961 : 8397 : gimple_seq stmts = NULL;
962 : 8397 : cached = force_gimple_operand (unshare_expr (expr),
963 : : &stmts, true, NULL_TREE);
964 : 8397 : if (stmts)
965 : : {
966 : 8255 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
967 : 8255 : gsi_insert_seq_on_edge_immediate (e, stmts);
968 : : }
969 : : }
970 : 13350 : return cached;
971 : : }
972 : :
973 : : /* Return true if we can use CMP_TYPE as the comparison type to produce
974 : : all masks required to mask LOOP_VINFO. */
975 : :
976 : : static bool
977 : 129 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
978 : : {
979 : 129 : rgroup_controls *rgm;
980 : 129 : unsigned int i;
981 : 181 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
982 : 181 : if (rgm->type != NULL_TREE
983 : 181 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
984 : : cmp_type, rgm->type,
985 : : OPTIMIZE_FOR_SPEED))
986 : : return false;
987 : : return true;
988 : : }
989 : :
990 : : /* Calculate the maximum number of scalars per iteration for every
991 : : rgroup in LOOP_VINFO. */
992 : :
993 : : static unsigned int
994 : 31 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
995 : : {
996 : 31 : unsigned int res = 1;
997 : 31 : unsigned int i;
998 : 31 : rgroup_controls *rgm;
999 : 216 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1000 : 185 : res = MAX (res, rgm->max_nscalars_per_iter);
1001 : 31 : return res;
1002 : : }
1003 : :
1004 : : /* Calculate the minimum precision necessary to represent:
1005 : :
1006 : : MAX_NITERS * FACTOR
1007 : :
1008 : : as an unsigned integer, where MAX_NITERS is the maximum number of
1009 : : loop header iterations for the original scalar form of LOOP_VINFO. */
1010 : :
1011 : : static unsigned
1012 : 31 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1013 : : {
1014 : 31 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1015 : :
1016 : : /* Get the maximum number of iterations that is representable
1017 : : in the counter type. */
1018 : 31 : tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1019 : 31 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1020 : :
1021 : : /* Get a more refined estimate for the number of iterations. */
1022 : 31 : widest_int max_back_edges;
1023 : 31 : if (max_loop_iterations (loop, &max_back_edges))
1024 : 31 : max_ni = wi::smin (max_ni, max_back_edges + 1);
1025 : :
1026 : : /* Work out how many bits we need to represent the limit. */
1027 : 31 : return wi::min_precision (max_ni * factor, UNSIGNED);
1028 : 31 : }
1029 : :
1030 : : /* True if the loop needs peeling or partial vectors when vectorized. */
1031 : :
1032 : : static bool
1033 : 113460 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1034 : : {
1035 : 113460 : unsigned HOST_WIDE_INT const_vf;
1036 : 113460 : HOST_WIDE_INT max_niter
1037 : 113460 : = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1038 : :
1039 : 113460 : unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1040 : 113460 : if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1041 : 15031 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1042 : : (loop_vinfo));
1043 : :
1044 : 113460 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1045 : 52138 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1046 : : {
1047 : : /* Work out the (constant) number of iterations that need to be
1048 : : peeled for reasons other than niters. */
1049 : 52102 : unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1050 : 52102 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1051 : 382 : peel_niter += 1;
1052 : 112363 : if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1053 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1054 : : return true;
1055 : : }
1056 : 61358 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1057 : : /* ??? When peeling for gaps but not alignment, we could
1058 : : try to check whether the (variable) niters is known to be
1059 : : VF * N + 1. That's something of a niche case though. */
1060 : 61125 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1061 : 60255 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1062 : 121613 : || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1063 : 120510 : < (unsigned) exact_log2 (const_vf))
1064 : : /* In case of versioning, check if the maximum number of
1065 : : iterations is greater than th. If they are identical,
1066 : : the epilogue is unnecessary. */
1067 : 59184 : && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1068 : 4370 : || ((unsigned HOST_WIDE_INT) max_niter
1069 : : /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1070 : : but that's only computed later based on our result.
1071 : : The following is the most conservative approximation. */
1072 : 4370 : > (std::max ((unsigned HOST_WIDE_INT) th,
1073 : 4370 : const_vf) / const_vf) * const_vf))))
1074 : 60261 : return true;
1075 : :
1076 : : return false;
1077 : : }
1078 : :
1079 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1080 : : whether we can actually generate the masks required. Return true if so,
1081 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1082 : :
1083 : : static bool
1084 : 31 : vect_verify_full_masking (loop_vec_info loop_vinfo)
1085 : : {
1086 : 31 : unsigned int min_ni_width;
1087 : :
1088 : : /* Use a normal loop if there are no statements that need masking.
1089 : : This only happens in rare degenerate cases: it means that the loop
1090 : : has no loads, no stores, and no live-out values. */
1091 : 31 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1092 : : return false;
1093 : :
1094 : : /* Produce the rgroup controls. */
1095 : 113 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1096 : : {
1097 : 41 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1098 : 41 : tree vectype = mask.first;
1099 : 41 : unsigned nvectors = mask.second;
1100 : :
1101 : 51 : if (masks->rgc_vec.length () < nvectors)
1102 : 36 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1103 : 41 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1104 : : /* The number of scalars per iteration and the number of vectors are
1105 : : both compile-time constants. */
1106 : 41 : unsigned int nscalars_per_iter
1107 : 41 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1108 : 41 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1109 : :
1110 : 41 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1111 : : {
1112 : 41 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1113 : 41 : rgm->type = truth_type_for (vectype);
1114 : 41 : rgm->factor = 1;
1115 : : }
1116 : : }
1117 : :
1118 : 31 : unsigned int max_nscalars_per_iter
1119 : 31 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1120 : :
1121 : : /* Work out how many bits we need to represent the limit. */
1122 : 31 : min_ni_width
1123 : 31 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1124 : :
1125 : : /* Find a scalar mode for which WHILE_ULT is supported. */
1126 : 31 : opt_scalar_int_mode cmp_mode_iter;
1127 : 31 : tree cmp_type = NULL_TREE;
1128 : 31 : tree iv_type = NULL_TREE;
1129 : 31 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1130 : 31 : unsigned int iv_precision = UINT_MAX;
1131 : :
1132 : 31 : if (iv_limit != -1)
1133 : 31 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1134 : : UNSIGNED);
1135 : :
1136 : 248 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1137 : : {
1138 : 217 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1139 : 217 : if (cmp_bits >= min_ni_width
1140 : 217 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1141 : : {
1142 : 129 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1143 : 129 : if (this_type
1144 : 129 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1145 : : {
1146 : : /* Although we could stop as soon as we find a valid mode,
1147 : : there are at least two reasons why that's not always the
1148 : : best choice:
1149 : :
1150 : : - An IV that's Pmode or wider is more likely to be reusable
1151 : : in address calculations than an IV that's narrower than
1152 : : Pmode.
1153 : :
1154 : : - Doing the comparison in IV_PRECISION or wider allows
1155 : : a natural 0-based IV, whereas using a narrower comparison
1156 : : type requires mitigations against wrap-around.
1157 : :
1158 : : Conversely, if the IV limit is variable, doing the comparison
1159 : : in a wider type than the original type can introduce
1160 : : unnecessary extensions, so picking the widest valid mode
1161 : : is not always a good choice either.
1162 : :
1163 : : Here we prefer the first IV type that's Pmode or wider,
1164 : : and the first comparison type that's IV_PRECISION or wider.
1165 : : (The comparison type must be no wider than the IV type,
1166 : : to avoid extensions in the vector loop.)
1167 : :
1168 : : ??? We might want to try continuing beyond Pmode for ILP32
1169 : : targets if CMP_BITS < IV_PRECISION. */
1170 : 0 : iv_type = this_type;
1171 : 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1172 : : cmp_type = this_type;
1173 : 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1174 : : break;
1175 : : }
1176 : : }
1177 : : }
1178 : :
1179 : 31 : if (!cmp_type)
1180 : : {
1181 : 31 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1182 : 31 : return false;
1183 : : }
1184 : :
1185 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1186 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1187 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1188 : 0 : return true;
1189 : 31 : }
1190 : :
1191 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1192 : : whether we can actually generate AVX512 style masks. Return true if so,
1193 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1194 : :
1195 : : static bool
1196 : 31 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1197 : : {
1198 : : /* Produce differently organized rgc_vec and differently check
1199 : : we can produce masks. */
1200 : :
1201 : : /* Use a normal loop if there are no statements that need masking.
1202 : : This only happens in rare degenerate cases: it means that the loop
1203 : : has no loads, no stores, and no live-out values. */
1204 : 31 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1205 : : return false;
1206 : :
1207 : : /* For the decrementing IV we need to represent all values in
1208 : : [0, niter + niter_skip] where niter_skip is the elements we
1209 : : skip in the first iteration for prologue peeling. */
1210 : 31 : tree iv_type = NULL_TREE;
1211 : 31 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1212 : 31 : unsigned int iv_precision = UINT_MAX;
1213 : 31 : if (iv_limit != -1)
1214 : 31 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1215 : :
1216 : : /* First compute the type for the IV we use to track the remaining
1217 : : scalar iterations. */
1218 : 31 : opt_scalar_int_mode cmp_mode_iter;
1219 : 58 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1220 : : {
1221 : 58 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1222 : 58 : if (cmp_bits >= iv_precision
1223 : 58 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1224 : : {
1225 : 31 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1226 : 31 : if (iv_type)
1227 : : break;
1228 : : }
1229 : : }
1230 : 31 : if (!iv_type)
1231 : : return false;
1232 : :
1233 : : /* Produce the rgroup controls. */
1234 : 113 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1235 : : {
1236 : 41 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1237 : 41 : tree vectype = mask.first;
1238 : 41 : unsigned nvectors = mask.second;
1239 : :
1240 : : /* The number of scalars per iteration and the number of vectors are
1241 : : both compile-time constants. */
1242 : 41 : unsigned int nscalars_per_iter
1243 : 41 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1244 : 41 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1245 : :
1246 : : /* We index the rgroup_controls vector with nscalars_per_iter
1247 : : which we keep constant and instead have a varying nvectors,
1248 : : remembering the vector mask with the fewest nV. */
1249 : 51 : if (masks->rgc_vec.length () < nscalars_per_iter)
1250 : 33 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1251 : 41 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1252 : :
1253 : 41 : if (!rgm->type || rgm->factor > nvectors)
1254 : : {
1255 : 38 : rgm->type = truth_type_for (vectype);
1256 : 38 : rgm->compare_type = NULL_TREE;
1257 : 38 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1258 : 38 : rgm->factor = nvectors;
1259 : 38 : rgm->bias_adjusted_ctrl = NULL_TREE;
1260 : : }
1261 : : }
1262 : :
1263 : : /* There is no fixed compare type we are going to use but we have to
1264 : : be able to get at one for each mask group. */
1265 : 31 : unsigned int min_ni_width
1266 : 31 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1267 : :
1268 : 31 : bool ok = true;
1269 : 138 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1270 : : {
1271 : 45 : tree mask_type = rgc.type;
1272 : 45 : if (!mask_type)
1273 : 10 : continue;
1274 : :
1275 : : /* For now vect_get_loop_mask only supports integer mode masks
1276 : : when we need to split it. */
1277 : 35 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1278 : 35 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1279 : : {
1280 : : ok = false;
1281 : : break;
1282 : : }
1283 : :
1284 : : /* If iv_type is usable as compare type use that - we can elide the
1285 : : saturation in that case. */
1286 : 35 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1287 : : {
1288 : 35 : tree cmp_vectype
1289 : 35 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1290 : 35 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1291 : 6 : rgc.compare_type = cmp_vectype;
1292 : : }
1293 : 35 : if (!rgc.compare_type)
1294 : 66 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1295 : : {
1296 : 66 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1297 : 66 : if (cmp_bits >= min_ni_width
1298 : 66 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1299 : : {
1300 : 66 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1301 : 66 : if (!cmp_type)
1302 : 0 : continue;
1303 : :
1304 : : /* Check whether we can produce the mask with cmp_type. */
1305 : 66 : tree cmp_vectype
1306 : 66 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1307 : 66 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1308 : : {
1309 : 29 : rgc.compare_type = cmp_vectype;
1310 : 29 : break;
1311 : : }
1312 : : }
1313 : : }
1314 : 35 : if (!rgc.compare_type)
1315 : : {
1316 : : ok = false;
1317 : : break;
1318 : : }
1319 : : }
1320 : 31 : if (!ok)
1321 : : {
1322 : 0 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1323 : 0 : return false;
1324 : : }
1325 : :
1326 : 31 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1327 : 31 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1328 : 31 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1329 : 31 : return true;
1330 : 31 : }
1331 : :
1332 : : /* Check whether we can use vector access with length based on precison
1333 : : comparison. So far, to keep it simple, we only allow the case that the
1334 : : precision of the target supported length is larger than the precision
1335 : : required by loop niters. */
1336 : :
1337 : : static bool
1338 : 0 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1339 : : {
1340 : 0 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1341 : : return false;
1342 : :
1343 : 0 : if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
1344 : : return false;
1345 : :
1346 : 0 : machine_mode len_load_mode, len_store_mode;
1347 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1348 : 0 : .exists (&len_load_mode))
1349 : 0 : return false;
1350 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1351 : 0 : .exists (&len_store_mode))
1352 : 0 : return false;
1353 : :
1354 : 0 : signed char partial_load_bias = internal_len_load_store_bias
1355 : 0 : (IFN_LEN_LOAD, len_load_mode);
1356 : :
1357 : 0 : signed char partial_store_bias = internal_len_load_store_bias
1358 : 0 : (IFN_LEN_STORE, len_store_mode);
1359 : :
1360 : 0 : gcc_assert (partial_load_bias == partial_store_bias);
1361 : :
1362 : 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1363 : : return false;
1364 : :
1365 : : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1366 : : len_loads with a length of zero. In order to avoid that we prohibit
1367 : : more than one loop length here. */
1368 : 0 : if (partial_load_bias == -1
1369 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1370 : : return false;
1371 : :
1372 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1373 : :
1374 : 0 : unsigned int max_nitems_per_iter = 1;
1375 : 0 : unsigned int i;
1376 : 0 : rgroup_controls *rgl;
1377 : : /* Find the maximum number of items per iteration for every rgroup. */
1378 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1379 : : {
1380 : 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1381 : 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1382 : : }
1383 : :
1384 : : /* Work out how many bits we need to represent the length limit. */
1385 : 0 : unsigned int min_ni_prec
1386 : 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1387 : :
1388 : : /* Now use the maximum of below precisions for one suitable IV type:
1389 : : - the IV's natural precision
1390 : : - the precision needed to hold: the maximum number of scalar
1391 : : iterations multiplied by the scale factor (min_ni_prec above)
1392 : : - the Pmode precision
1393 : :
1394 : : If min_ni_prec is less than the precision of the current niters,
1395 : : we perfer to still use the niters type. Prefer to use Pmode and
1396 : : wider IV to avoid narrow conversions. */
1397 : :
1398 : 0 : unsigned int ni_prec
1399 : 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1400 : 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1401 : 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1402 : :
1403 : 0 : tree iv_type = NULL_TREE;
1404 : 0 : opt_scalar_int_mode tmode_iter;
1405 : 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1406 : : {
1407 : 0 : scalar_mode tmode = tmode_iter.require ();
1408 : 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1409 : :
1410 : : /* ??? Do we really want to construct one IV whose precision exceeds
1411 : : BITS_PER_WORD? */
1412 : 0 : if (tbits > BITS_PER_WORD)
1413 : : break;
1414 : :
1415 : : /* Find the first available standard integral type. */
1416 : 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1417 : : {
1418 : 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1419 : 0 : break;
1420 : : }
1421 : : }
1422 : :
1423 : 0 : if (!iv_type)
1424 : : {
1425 : 0 : if (dump_enabled_p ())
1426 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1427 : : "can't vectorize with length-based partial vectors"
1428 : : " because there is no suitable iv type.\n");
1429 : 0 : return false;
1430 : : }
1431 : :
1432 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1433 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1434 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1435 : :
1436 : 0 : return true;
1437 : : }
1438 : :
1439 : : /* Calculate the cost of one scalar iteration of the loop. */
1440 : : static void
1441 : 284677 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1442 : : {
1443 : 284677 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1444 : 284677 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1445 : 284677 : int nbbs = loop->num_nodes, factor;
1446 : 284677 : int innerloop_iters, i;
1447 : :
1448 : 284677 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1449 : :
1450 : : /* Gather costs for statements in the scalar loop. */
1451 : :
1452 : : /* FORNOW. */
1453 : 284677 : innerloop_iters = 1;
1454 : 284677 : if (loop->inner)
1455 : 1286 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1456 : :
1457 : 1011994 : for (i = 0; i < nbbs; i++)
1458 : : {
1459 : 727317 : gimple_stmt_iterator si;
1460 : 727317 : basic_block bb = bbs[i];
1461 : :
1462 : 727317 : if (bb->loop_father == loop->inner)
1463 : : factor = innerloop_iters;
1464 : : else
1465 : 724745 : factor = 1;
1466 : :
1467 : 5809089 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1468 : : {
1469 : 4354455 : gimple *stmt = gsi_stmt (si);
1470 : 4354455 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1471 : :
1472 : 4354455 : if (!is_gimple_assign (stmt)
1473 : : && !is_gimple_call (stmt)
1474 : : && !is_a<gcond *> (stmt))
1475 : 1571488 : continue;
1476 : :
1477 : : /* Skip stmts that are not vectorized inside the loop. */
1478 : 2782967 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1479 : 2782967 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1480 : 1186798 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1481 : 73 : || !VECTORIZABLE_CYCLE_DEF
1482 : : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1483 : 1186798 : continue;
1484 : :
1485 : 1596169 : vect_cost_for_stmt kind;
1486 : 1596169 : if (STMT_VINFO_DATA_REF (stmt_info))
1487 : : {
1488 : 675064 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1489 : : kind = scalar_load;
1490 : : else
1491 : 239368 : kind = scalar_store;
1492 : : }
1493 : 921105 : else if (vect_nop_conversion_p (stmt_info))
1494 : 40624 : continue;
1495 : : else
1496 : : kind = scalar_stmt;
1497 : :
1498 : : /* We are using vect_prologue here to avoid scaling twice
1499 : : by the inner loop factor. */
1500 : 1555545 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1501 : : factor, kind, stmt_info, 0, vect_prologue);
1502 : : }
1503 : : }
1504 : :
1505 : : /* Now accumulate cost. */
1506 : 284677 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1507 : 284677 : add_stmt_costs (loop_vinfo->scalar_costs,
1508 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1509 : 284677 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1510 : 284677 : }
1511 : :
1512 : : /* Function vect_analyze_loop_form.
1513 : :
1514 : : Verify that certain CFG restrictions hold, including:
1515 : : - the loop has a pre-header
1516 : : - the loop has a single entry
1517 : : - nested loops can have only a single exit.
1518 : : - the loop exit condition is simple enough
1519 : : - the number of iterations can be analyzed, i.e, a countable loop. The
1520 : : niter could be analyzed under some assumptions. */
1521 : :
1522 : : opt_result
1523 : 464252 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1524 : : vect_loop_form_info *info)
1525 : : {
1526 : 464252 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1527 : :
1528 : 464252 : edge exit_e = vec_init_loop_exit_info (loop);
1529 : 464252 : if (!exit_e)
1530 : 58098 : return opt_result::failure_at (vect_location,
1531 : : "not vectorized:"
1532 : : " could not determine main exit from"
1533 : : " loop with multiple exits.\n");
1534 : 406154 : if (loop_vectorized_call)
1535 : : {
1536 : 25842 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1537 : 25842 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1538 : 25842 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1539 : 25842 : if (!scalar_exit_e)
1540 : 0 : return opt_result::failure_at (vect_location,
1541 : : "not vectorized:"
1542 : : " could not determine main exit from"
1543 : : " loop with multiple exits.\n");
1544 : : }
1545 : :
1546 : 406154 : info->loop_exit = exit_e;
1547 : 406154 : if (dump_enabled_p ())
1548 : 14922 : dump_printf_loc (MSG_NOTE, vect_location,
1549 : : "using as main loop exit: %d -> %d [AUX: %p]\n",
1550 : 14922 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1551 : :
1552 : : /* Check if we have any control flow that doesn't leave the loop. */
1553 : 406154 : basic_block *bbs = get_loop_body (loop);
1554 : 1365835 : for (unsigned i = 0; i < loop->num_nodes; i++)
1555 : 1069875 : if (EDGE_COUNT (bbs[i]->succs) != 1
1556 : 1069875 : && (EDGE_COUNT (bbs[i]->succs) != 2
1557 : 633083 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1558 : : {
1559 : 110194 : free (bbs);
1560 : 110194 : return opt_result::failure_at (vect_location,
1561 : : "not vectorized:"
1562 : : " unsupported control flow in loop.\n");
1563 : : }
1564 : 295960 : free (bbs);
1565 : :
1566 : : /* Different restrictions apply when we are considering an inner-most loop,
1567 : : vs. an outer (nested) loop.
1568 : : (FORNOW. May want to relax some of these restrictions in the future). */
1569 : :
1570 : 295960 : info->inner_loop_cond = NULL;
1571 : 295960 : if (!loop->inner)
1572 : : {
1573 : : /* Inner-most loop. */
1574 : :
1575 : 274583 : if (empty_block_p (loop->header))
1576 : 3 : return opt_result::failure_at (vect_location,
1577 : : "not vectorized: empty loop.\n");
1578 : : }
1579 : : else
1580 : : {
1581 : 21377 : class loop *innerloop = loop->inner;
1582 : 21377 : edge entryedge;
1583 : :
1584 : : /* Nested loop. We currently require that the loop is doubly-nested,
1585 : : contains a single inner loop with a single exit to the block
1586 : : with the single exit condition in the outer loop.
1587 : : Vectorizable outer-loops look like this:
1588 : :
1589 : : (pre-header)
1590 : : |
1591 : : header <---+
1592 : : | |
1593 : : inner-loop |
1594 : : | |
1595 : : tail ------+
1596 : : |
1597 : : (exit-bb)
1598 : :
1599 : : The inner-loop also has the properties expected of inner-most loops
1600 : : as described above. */
1601 : :
1602 : 21377 : if ((loop->inner)->inner || (loop->inner)->next)
1603 : 3021 : return opt_result::failure_at (vect_location,
1604 : : "not vectorized:"
1605 : : " multiple nested loops.\n");
1606 : :
1607 : 18356 : entryedge = loop_preheader_edge (innerloop);
1608 : 18356 : if (entryedge->src != loop->header
1609 : 18010 : || !single_exit (innerloop)
1610 : 29482 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1611 : 7512 : return opt_result::failure_at (vect_location,
1612 : : "not vectorized:"
1613 : : " unsupported outerloop form.\n");
1614 : :
1615 : : /* Analyze the inner-loop. */
1616 : 10844 : vect_loop_form_info inner;
1617 : 10844 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1618 : 10844 : if (!res)
1619 : : {
1620 : 1169 : if (dump_enabled_p ())
1621 : 5 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1622 : : "not vectorized: Bad inner loop.\n");
1623 : 1169 : return res;
1624 : : }
1625 : :
1626 : : /* Don't support analyzing niter under assumptions for inner
1627 : : loop. */
1628 : 9675 : if (!integer_onep (inner.assumptions))
1629 : 303 : return opt_result::failure_at (vect_location,
1630 : : "not vectorized: Bad inner loop.\n");
1631 : :
1632 : 9372 : if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1633 : 1084 : return opt_result::failure_at (vect_location,
1634 : : "not vectorized: inner-loop count not"
1635 : : " invariant.\n");
1636 : :
1637 : 8288 : if (dump_enabled_p ())
1638 : 939 : dump_printf_loc (MSG_NOTE, vect_location,
1639 : : "Considering outer-loop vectorization.\n");
1640 : 8288 : info->inner_loop_cond = inner.conds[0];
1641 : 10844 : }
1642 : :
1643 : 282868 : if (EDGE_COUNT (loop->header->preds) != 2)
1644 : 0 : return opt_result::failure_at (vect_location,
1645 : : "not vectorized:"
1646 : : " too many incoming edges.\n");
1647 : :
1648 : : /* We assume that the latch is empty. */
1649 : 282868 : basic_block latch = loop->latch;
1650 : 282868 : do
1651 : : {
1652 : 282868 : if (!empty_block_p (latch)
1653 : 282868 : || !gimple_seq_empty_p (phi_nodes (latch)))
1654 : 19983 : return opt_result::failure_at (vect_location,
1655 : : "not vectorized: latch block not "
1656 : : "empty.\n");
1657 : 262885 : latch = single_pred (latch);
1658 : : }
1659 : 525770 : while (single_succ_p (latch));
1660 : :
1661 : : /* Make sure there is no abnormal exit. */
1662 : 262885 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1663 : 1161638 : for (edge e : exits)
1664 : : {
1665 : 373027 : if (e->flags & EDGE_ABNORMAL)
1666 : 44 : return opt_result::failure_at (vect_location,
1667 : : "not vectorized:"
1668 : : " abnormal loop exit edge.\n");
1669 : : }
1670 : :
1671 : 262841 : info->conds
1672 : 262841 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1673 : : &info->number_of_iterations,
1674 : 262841 : &info->number_of_iterationsm1);
1675 : 262841 : if (info->conds.is_empty ())
1676 : 36 : return opt_result::failure_at
1677 : 36 : (vect_location,
1678 : : "not vectorized: complicated exit condition.\n");
1679 : :
1680 : : /* Determine what the primary and alternate exit conds are. */
1681 : 625228 : for (unsigned i = 0; i < info->conds.length (); i++)
1682 : : {
1683 : 362423 : gcond *cond = info->conds[i];
1684 : 362423 : if (exit_e->src == gimple_bb (cond))
1685 : 262805 : std::swap (info->conds[0], info->conds[i]);
1686 : : }
1687 : :
1688 : 262805 : if (integer_zerop (info->assumptions)
1689 : 262805 : || !info->number_of_iterations
1690 : 525610 : || chrec_contains_undetermined (info->number_of_iterations))
1691 : 40857 : return opt_result::failure_at
1692 : 40857 : (info->conds[0],
1693 : : "not vectorized: number of iterations cannot be computed.\n");
1694 : :
1695 : 221948 : if (integer_zerop (info->number_of_iterations))
1696 : 14 : return opt_result::failure_at
1697 : 14 : (info->conds[0],
1698 : : "not vectorized: number of iterations = 0.\n");
1699 : :
1700 : 221934 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1701 : 119740 : && tree_to_shwi (info->number_of_iterations) > 0))
1702 : : {
1703 : 102194 : if (dump_enabled_p ())
1704 : : {
1705 : 2319 : dump_printf_loc (MSG_NOTE, vect_location,
1706 : : "Symbolic number of iterations is ");
1707 : 2319 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1708 : 2319 : dump_printf (MSG_NOTE, "\n");
1709 : : }
1710 : : }
1711 : :
1712 : 221934 : if (!integer_onep (info->assumptions))
1713 : : {
1714 : 10743 : if (dump_enabled_p ())
1715 : : {
1716 : 65 : dump_printf_loc (MSG_NOTE, vect_location,
1717 : : "Loop to be versioned with niter assumption ");
1718 : 65 : dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
1719 : 65 : dump_printf (MSG_NOTE, "\n");
1720 : : }
1721 : : }
1722 : :
1723 : 221934 : return opt_result::success ();
1724 : 262885 : }
1725 : :
1726 : : /* Create a loop_vec_info for LOOP with SHARED and the
1727 : : vect_analyze_loop_form result. */
1728 : :
1729 : : loop_vec_info
1730 : 414444 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1731 : : const vect_loop_form_info *info,
1732 : : loop_vec_info orig_loop_info)
1733 : : {
1734 : 414444 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1735 : 414444 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1736 : 414444 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1737 : 414444 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1738 : 414444 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1739 : 414444 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1740 : 166 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1741 : 166 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1742 : : else
1743 : 414278 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1744 : : /* Also record the assumptions for versioning. */
1745 : 414444 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1746 : 21630 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1747 : :
1748 : 1876480 : for (gcond *cond : info->conds)
1749 : : {
1750 : 633148 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1751 : : /* Mark the statement as a condition. */
1752 : 633148 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1753 : : }
1754 : :
1755 : 633148 : for (unsigned i = 1; i < info->conds.length (); i ++)
1756 : 218704 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1757 : 414444 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1758 : :
1759 : 414444 : LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1760 : :
1761 : : /* Check to see if we're vectorizing multiple exits. */
1762 : 414444 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1763 : 414444 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1764 : :
1765 : 414444 : if (info->inner_loop_cond)
1766 : : {
1767 : : /* If we have an estimate on the number of iterations of the inner
1768 : : loop use that to limit the scale for costing, otherwise use
1769 : : --param vect-inner-loop-cost-factor literally. */
1770 : 8411 : widest_int nit;
1771 : 8411 : if (estimated_stmt_executions (loop->inner, &nit))
1772 : 7173 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1773 : 7173 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1774 : 8411 : }
1775 : :
1776 : 414444 : return loop_vinfo;
1777 : : }
1778 : :
1779 : :
1780 : :
1781 : : /* Return true if we know that the iteration count is smaller than the
1782 : : vectorization factor. Return false if it isn't, or if we can't be sure
1783 : : either way. */
1784 : :
1785 : : static bool
1786 : 105945 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1787 : : {
1788 : 105945 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1789 : :
1790 : 105945 : HOST_WIDE_INT max_niter;
1791 : 105945 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1792 : 50773 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1793 : : else
1794 : 55172 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1795 : :
1796 : 105945 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1797 : 8139 : return true;
1798 : :
1799 : : return false;
1800 : : }
1801 : :
1802 : : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1803 : : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1804 : : definitely no, or -1 if it's worth retrying. */
1805 : :
1806 : : static int
1807 : 105953 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1808 : : unsigned *suggested_unroll_factor)
1809 : : {
1810 : 105953 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1811 : 105953 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1812 : :
1813 : : /* Only loops that can handle partially-populated vectors can have iteration
1814 : : counts less than the vectorization factor. */
1815 : 105953 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
1816 : 105953 : && vect_known_niters_smaller_than_vf (loop_vinfo))
1817 : : {
1818 : 8129 : if (dump_enabled_p ())
1819 : 234 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1820 : : "not vectorized: iteration count smaller than "
1821 : : "vectorization factor.\n");
1822 : 8129 : return 0;
1823 : : }
1824 : :
1825 : : /* If we know the number of iterations we can do better, for the
1826 : : epilogue we can also decide whether the main loop leaves us
1827 : : with enough iterations, prefering a smaller vector epilog then
1828 : : also possibly used for the case we skip the vector loop. */
1829 : 97824 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1830 : : {
1831 : 42949 : widest_int scalar_niters
1832 : 42949 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
1833 : 42949 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1834 : : {
1835 : 2630 : loop_vec_info orig_loop_vinfo
1836 : : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1837 : 2630 : loop_vec_info main_loop_vinfo
1838 : : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
1839 : 2630 : unsigned lowest_vf
1840 : 2630 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
1841 : 2630 : int prolog_peeling = 0;
1842 : 2630 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
1843 : 2630 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
1844 : 2630 : if (prolog_peeling >= 0
1845 : 2630 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
1846 : : lowest_vf))
1847 : : {
1848 : 5250 : unsigned gap
1849 : 2625 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
1850 : 5250 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
1851 : 5250 : % lowest_vf + gap);
1852 : : }
1853 : : }
1854 : : /* Reject vectorizing for a single scalar iteration, even if
1855 : : we could in principle implement that using partial vectors. */
1856 : 42949 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1857 : 42949 : if (scalar_niters <= peeling_gap + 1)
1858 : : {
1859 : 784 : if (dump_enabled_p ())
1860 : 168 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 : : "not vectorized: loop only has a single "
1862 : : "scalar iteration.\n");
1863 : 784 : return 0;
1864 : : }
1865 : :
1866 : 42165 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1867 : : {
1868 : : /* Check that the loop processes at least one full vector. */
1869 : 42154 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1870 : 42154 : if (known_lt (scalar_niters, vf))
1871 : : {
1872 : 357 : if (dump_enabled_p ())
1873 : 289 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 : : "loop does not have enough iterations "
1875 : : "to support vectorization.\n");
1876 : 397 : return 0;
1877 : : }
1878 : :
1879 : : /* If we need to peel an extra epilogue iteration to handle data
1880 : : accesses with gaps, check that there are enough scalar iterations
1881 : : available.
1882 : :
1883 : : The check above is redundant with this one when peeling for gaps,
1884 : : but the distinction is useful for diagnostics. */
1885 : 41797 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1886 : 42079 : && known_le (scalar_niters, vf))
1887 : : {
1888 : 40 : if (dump_enabled_p ())
1889 : 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1890 : : "loop does not have enough iterations "
1891 : : "to support peeling for gaps.\n");
1892 : 40 : return 0;
1893 : : }
1894 : : }
1895 : 42949 : }
1896 : :
1897 : : /* If using the "very cheap" model. reject cases in which we'd keep
1898 : : a copy of the scalar code (even if we might be able to vectorize it). */
1899 : 96643 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1900 : 96643 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1901 : 47554 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
1902 : : {
1903 : 708 : if (dump_enabled_p ())
1904 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1905 : : "some scalar iterations would need to be peeled\n");
1906 : 708 : return 0;
1907 : : }
1908 : :
1909 : 95935 : int min_profitable_iters, min_profitable_estimate;
1910 : 95935 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1911 : : &min_profitable_estimate,
1912 : : suggested_unroll_factor);
1913 : :
1914 : 95935 : if (min_profitable_iters < 0)
1915 : : {
1916 : 25325 : if (dump_enabled_p ())
1917 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1918 : : "not vectorized: vectorization not profitable.\n");
1919 : 25325 : if (dump_enabled_p ())
1920 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1921 : : "not vectorized: vector version will never be "
1922 : : "profitable.\n");
1923 : 25325 : return -1;
1924 : : }
1925 : :
1926 : 70610 : int min_scalar_loop_bound = (param_min_vect_loop_bound
1927 : 70610 : * assumed_vf);
1928 : :
1929 : : /* Use the cost model only if it is more conservative than user specified
1930 : : threshold. */
1931 : 70610 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1932 : : min_profitable_iters);
1933 : :
1934 : 70610 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1935 : :
1936 : 35545 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1937 : 106155 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1938 : : {
1939 : 402 : if (dump_enabled_p ())
1940 : 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 : : "not vectorized: vectorization not profitable.\n");
1942 : 402 : if (dump_enabled_p ())
1943 : 1 : dump_printf_loc (MSG_NOTE, vect_location,
1944 : : "not vectorized: iteration count smaller than user "
1945 : : "specified loop bound parameter or minimum profitable "
1946 : : "iterations (whichever is more conservative).\n");
1947 : 402 : return 0;
1948 : : }
1949 : :
1950 : : /* The static profitablity threshold min_profitable_estimate includes
1951 : : the cost of having to check at runtime whether the scalar loop
1952 : : should be used instead. If it turns out that we don't need or want
1953 : : such a check, the threshold we should use for the static estimate
1954 : : is simply the point at which the vector loop becomes more profitable
1955 : : than the scalar loop. */
1956 : 70208 : if (min_profitable_estimate > min_profitable_iters
1957 : 15215 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1958 : 14753 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1959 : 262 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1960 : 70470 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1961 : : {
1962 : 8 : if (dump_enabled_p ())
1963 : 3 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1964 : : " choice between the scalar and vector loops\n");
1965 : 8 : min_profitable_estimate = min_profitable_iters;
1966 : : }
1967 : :
1968 : : /* If the vector loop needs multiple iterations to be beneficial then
1969 : : things are probably too close to call, and the conservative thing
1970 : : would be to stick with the scalar code. */
1971 : 70208 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1972 : 70208 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1973 : : {
1974 : 8061 : if (dump_enabled_p ())
1975 : 177 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1976 : : "one iteration of the vector loop would be"
1977 : : " more expensive than the equivalent number of"
1978 : : " iterations of the scalar loop\n");
1979 : 8061 : return 0;
1980 : : }
1981 : :
1982 : 62147 : HOST_WIDE_INT estimated_niter;
1983 : :
1984 : : /* If we are vectorizing an epilogue then we know the maximum number of
1985 : : scalar iterations it will cover is at least one lower than the
1986 : : vectorization factor of the main loop. */
1987 : 62147 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1988 : 10507 : estimated_niter
1989 : 10507 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1990 : : else
1991 : : {
1992 : 51640 : estimated_niter = estimated_stmt_executions_int (loop);
1993 : 51640 : if (estimated_niter == -1)
1994 : 19807 : estimated_niter = likely_max_stmt_executions_int (loop);
1995 : : }
1996 : 30314 : if (estimated_niter != -1
1997 : 60537 : && ((unsigned HOST_WIDE_INT) estimated_niter
1998 : 60537 : < MAX (th, (unsigned) min_profitable_estimate)))
1999 : : {
2000 : 4265 : if (dump_enabled_p ())
2001 : 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2002 : : "not vectorized: estimated iteration count too "
2003 : : "small.\n");
2004 : 4265 : if (dump_enabled_p ())
2005 : 28 : dump_printf_loc (MSG_NOTE, vect_location,
2006 : : "not vectorized: estimated iteration count smaller "
2007 : : "than specified loop bound parameter or minimum "
2008 : : "profitable iterations (whichever is more "
2009 : : "conservative).\n");
2010 : 4265 : return -1;
2011 : : }
2012 : :
2013 : : return 1;
2014 : : }
2015 : :
2016 : : static opt_result
2017 : 219616 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2018 : : vec<data_reference_p> *datarefs)
2019 : : {
2020 : 673285 : for (unsigned i = 0; i < loop->num_nodes; i++)
2021 : 995804 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2022 : 3766387 : !gsi_end_p (gsi); gsi_next (&gsi))
2023 : : {
2024 : 3312718 : gimple *stmt = gsi_stmt (gsi);
2025 : 3312718 : if (is_gimple_debug (stmt))
2026 : 1227184 : continue;
2027 : 2085662 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2028 : : NULL, 0);
2029 : 2085662 : if (!res)
2030 : : {
2031 : 44361 : if (is_gimple_call (stmt) && loop->safelen)
2032 : : {
2033 : 397 : tree fndecl = gimple_call_fndecl (stmt), op;
2034 : 397 : if (fndecl == NULL_TREE
2035 : 397 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2036 : : {
2037 : 0 : fndecl = gimple_call_arg (stmt, 0);
2038 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2039 : 0 : fndecl = TREE_OPERAND (fndecl, 0);
2040 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2041 : : }
2042 : 397 : if (fndecl != NULL_TREE)
2043 : : {
2044 : 363 : cgraph_node *node = cgraph_node::get (fndecl);
2045 : 363 : if (node != NULL && node->simd_clones != NULL)
2046 : : {
2047 : 129 : unsigned int j, n = gimple_call_num_args (stmt);
2048 : 539 : for (j = 0; j < n; j++)
2049 : : {
2050 : 282 : op = gimple_call_arg (stmt, j);
2051 : 282 : if (DECL_P (op)
2052 : 282 : || (REFERENCE_CLASS_P (op)
2053 : 0 : && get_base_address (op)))
2054 : : break;
2055 : : }
2056 : 129 : op = gimple_call_lhs (stmt);
2057 : : /* Ignore #pragma omp declare simd functions
2058 : : if they don't have data references in the
2059 : : call stmt itself. */
2060 : 257 : if (j == n
2061 : 129 : && !(op
2062 : 118 : && (DECL_P (op)
2063 : 118 : || (REFERENCE_CLASS_P (op)
2064 : 0 : && get_base_address (op)))))
2065 : 128 : continue;
2066 : : }
2067 : : }
2068 : : }
2069 : 44233 : return res;
2070 : : }
2071 : : /* If dependence analysis will give up due to the limit on the
2072 : : number of datarefs stop here and fail fatally. */
2073 : 3579317 : if (datarefs->length ()
2074 : 1538016 : > (unsigned)param_loop_max_datarefs_for_datadeps)
2075 : 0 : return opt_result::failure_at (stmt, "exceeded param "
2076 : : "loop-max-datarefs-for-datadeps\n");
2077 : : }
2078 : 175383 : return opt_result::success ();
2079 : : }
2080 : :
2081 : : /* Determine if operating on full vectors for LOOP_VINFO might leave
2082 : : some scalar iterations still to do. If so, decide how we should
2083 : : handle those scalar iterations. The possibilities are:
2084 : :
2085 : : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2086 : : In this case:
2087 : :
2088 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2089 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2090 : : LOOP_VINFO_PEELING_FOR_NITER == false
2091 : :
2092 : : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2093 : : to handle the remaining scalar iterations. In this case:
2094 : :
2095 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2096 : : LOOP_VINFO_PEELING_FOR_NITER == true
2097 : :
2098 : : There are two choices:
2099 : :
2100 : : (2a) Consider vectorizing the epilogue loop at the same VF as the
2101 : : main loop, but using partial vectors instead of full vectors.
2102 : : In this case:
2103 : :
2104 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2105 : :
2106 : : (2b) Consider vectorizing the epilogue loop at lower VFs only.
2107 : : In this case:
2108 : :
2109 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2110 : : */
2111 : :
2112 : : opt_result
2113 : 113460 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2114 : : {
2115 : : /* Determine whether there would be any scalar iterations left over. */
2116 : 113460 : bool need_peeling_or_partial_vectors_p
2117 : 113460 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2118 : :
2119 : : /* Decide whether to vectorize the loop with partial vectors. */
2120 : 113460 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2121 : 113460 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2122 : 113460 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2123 : 37 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2124 : 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2125 : 113460 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2126 : 37 : && need_peeling_or_partial_vectors_p)
2127 : : {
2128 : : /* For partial-vector-usage=1, try to push the handling of partial
2129 : : vectors to the epilogue, with the main loop continuing to operate
2130 : : on full vectors.
2131 : :
2132 : : If we are unrolling we also do not want to use partial vectors. This
2133 : : is to avoid the overhead of generating multiple masks and also to
2134 : : avoid having to execute entire iterations of FALSE masked instructions
2135 : : when dealing with one or less full iterations.
2136 : :
2137 : : ??? We could then end up failing to use partial vectors if we
2138 : : decide to peel iterations into a prologue, and if the main loop
2139 : : then ends up processing fewer than VF iterations. */
2140 : 32 : if ((param_vect_partial_vector_usage == 1
2141 : 10 : || loop_vinfo->suggested_unroll_factor > 1)
2142 : 22 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2143 : 46 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2144 : 4 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2145 : : else
2146 : 28 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2147 : : }
2148 : :
2149 : 113460 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2150 : 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2151 : 0 : return opt_result::failure_at (vect_location,
2152 : : "not vectorized: loop needs but cannot "
2153 : : "use partial vectors\n");
2154 : :
2155 : 113460 : if (dump_enabled_p ())
2156 : 12556 : dump_printf_loc (MSG_NOTE, vect_location,
2157 : : "operating on %s vectors%s.\n",
2158 : 12556 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2159 : : ? "partial" : "full",
2160 : 12556 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2161 : : ? " for epilogue loop" : "");
2162 : :
2163 : 113460 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2164 : 226920 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2165 : 113460 : && need_peeling_or_partial_vectors_p);
2166 : :
2167 : : /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2168 : : analysis that we don't know whether the loop is vectorized by partial
2169 : : vectors (More details see tree-vect-loop-manip.cc).
2170 : :
2171 : : However, SELECT_VL vectorizaton style should only applied on partial
2172 : : vectorization since SELECT_VL is the GIMPLE IR that calculates the
2173 : : number of elements to be process for each iteration.
2174 : :
2175 : : After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2176 : : if it is not partial vectorized loop. */
2177 : 113460 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2178 : 113432 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2179 : :
2180 : 113460 : return opt_result::success ();
2181 : : }
2182 : :
2183 : : /* Function vect_analyze_loop_2.
2184 : :
2185 : : Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2186 : : analyses will record information in some members of LOOP_VINFO. FATAL
2187 : : indicates if some analysis meets fatal error. If one non-NULL pointer
2188 : : SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2189 : : worked out suggested unroll factor, while one NULL pointer shows it's
2190 : : going to apply the suggested unroll factor.
2191 : : SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF is to hold whether single-lane
2192 : : slp was forced when the suggested unroll factor was worked out. */
2193 : : static opt_result
2194 : 413745 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2195 : : unsigned *suggested_unroll_factor,
2196 : : bool& single_lane_slp_done_for_suggested_uf)
2197 : : {
2198 : 413745 : opt_result ok = opt_result::success ();
2199 : 413745 : int res;
2200 : 413745 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2201 : 413745 : loop_vec_info orig_loop_vinfo = NULL;
2202 : :
2203 : : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2204 : : loop_vec_info of the first vectorized loop. */
2205 : 413745 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2206 : 17445 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2207 : : else
2208 : : orig_loop_vinfo = loop_vinfo;
2209 : 17445 : gcc_assert (orig_loop_vinfo);
2210 : :
2211 : : /* The first group of checks is independent of the vector size. */
2212 : 413745 : fatal = true;
2213 : :
2214 : 413745 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2215 : 413745 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2216 : 5 : return opt_result::failure_at (vect_location,
2217 : : "not vectorized: simd if(0)\n");
2218 : :
2219 : : /* Find all data references in the loop (which correspond to vdefs/vuses)
2220 : : and analyze their evolution in the loop. */
2221 : :
2222 : 413740 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2223 : :
2224 : : /* Gather the data references and count stmts in the loop. */
2225 : 413740 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2226 : : {
2227 : 219616 : opt_result res
2228 : 219616 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2229 : : &LOOP_VINFO_DATAREFS (loop_vinfo));
2230 : 219616 : if (!res)
2231 : : {
2232 : 44233 : if (dump_enabled_p ())
2233 : 1465 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2234 : : "not vectorized: loop contains function "
2235 : : "calls or data references that cannot "
2236 : : "be analyzed\n");
2237 : 44233 : return res;
2238 : : }
2239 : 175383 : loop_vinfo->shared->save_datarefs ();
2240 : : }
2241 : : else
2242 : 194124 : loop_vinfo->shared->check_datarefs ();
2243 : :
2244 : : /* Analyze the data references and also adjust the minimal
2245 : : vectorization factor according to the loads and stores. */
2246 : :
2247 : 369507 : ok = vect_analyze_data_refs (loop_vinfo, &fatal);
2248 : 369507 : if (!ok)
2249 : : {
2250 : 50275 : if (dump_enabled_p ())
2251 : 964 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 : : "bad data references.\n");
2253 : 50275 : return ok;
2254 : : }
2255 : :
2256 : : /* Check if we are applying unroll factor now. */
2257 : 319232 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2258 : 319232 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2259 : :
2260 : : /* When single-lane SLP was forced and we are applying suggested unroll
2261 : : factor, keep that decision here. */
2262 : 638464 : bool force_single_lane = (applying_suggested_uf
2263 : 319232 : && single_lane_slp_done_for_suggested_uf);
2264 : :
2265 : : /* Classify all cross-iteration scalar data-flow cycles.
2266 : : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2267 : 319232 : vect_analyze_scalar_cycles (loop_vinfo, !force_single_lane);
2268 : :
2269 : 319232 : vect_pattern_recog (loop_vinfo);
2270 : :
2271 : 319232 : vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2272 : :
2273 : : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2274 : : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2275 : :
2276 : 319232 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2277 : 319232 : if (!ok)
2278 : : {
2279 : 6801 : if (dump_enabled_p ())
2280 : 262 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2281 : : "bad data access.\n");
2282 : 6801 : return ok;
2283 : : }
2284 : :
2285 : : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2286 : :
2287 : 312431 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2288 : 312431 : if (!ok)
2289 : : {
2290 : 13539 : if (dump_enabled_p ())
2291 : 304 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2292 : : "unexpected pattern.\n");
2293 : 13539 : return ok;
2294 : : }
2295 : :
2296 : : /* While the rest of the analysis below depends on it in some way. */
2297 : 298892 : fatal = false;
2298 : :
2299 : : /* Analyze data dependences between the data-refs in the loop
2300 : : and adjust the maximum vectorization factor according to
2301 : : the dependences.
2302 : : FORNOW: fail at the first data dependence that we encounter. */
2303 : :
2304 : 298892 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2305 : 298892 : if (!ok)
2306 : : {
2307 : 14215 : if (dump_enabled_p ())
2308 : 372 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 : : "bad data dependence.\n");
2310 : 14215 : return ok;
2311 : : }
2312 : 284677 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2313 : :
2314 : : /* Compute the scalar iteration cost. */
2315 : 284677 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2316 : :
2317 : 284677 : bool saved_can_use_partial_vectors_p
2318 : : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2319 : :
2320 : : /* This is the point where we can re-start analysis with single-lane
2321 : : SLP forced. */
2322 : 408654 : start_over:
2323 : :
2324 : : /* Check the SLP opportunities in the loop, analyze and build
2325 : : SLP trees. */
2326 : 817308 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2327 : : force_single_lane);
2328 : 408654 : if (!ok)
2329 : 28100 : return ok;
2330 : :
2331 : : /* If there are any SLP instances mark them as pure_slp. */
2332 : 380554 : if (!vect_make_slp_decision (loop_vinfo))
2333 : 38461 : return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
2334 : :
2335 : 342093 : if (dump_enabled_p ())
2336 : 17496 : dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
2337 : :
2338 : : /* Determine the vectorization factor from the SLP decision. */
2339 : 342093 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2340 : 342093 : = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2341 : 342093 : if (dump_enabled_p ())
2342 : : {
2343 : 17496 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
2344 : 17496 : dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2345 : 17496 : dump_printf (MSG_NOTE, "\n");
2346 : : }
2347 : :
2348 : : /* Optimize the SLP graph with the vectorization factor fixed. */
2349 : 342093 : vect_optimize_slp (loop_vinfo);
2350 : :
2351 : : /* Gather the loads reachable from the SLP graph entries. */
2352 : 342093 : vect_gather_slp_loads (loop_vinfo);
2353 : :
2354 : : /* We don't expect to have to roll back to anything other than an empty
2355 : : set of rgroups. */
2356 : 342093 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2357 : :
2358 : : /* Apply the suggested unrolling factor, this was determined by the backend
2359 : : during finish_cost the first time we ran the analyzis for this
2360 : : vector mode. */
2361 : 342093 : if (applying_suggested_uf)
2362 : 239 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2363 : :
2364 : : /* Now the vectorization factor is final. */
2365 : 342093 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2366 : 342093 : gcc_assert (known_ne (vectorization_factor, 0U));
2367 : :
2368 : 342093 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2369 : : {
2370 : 13306 : dump_printf_loc (MSG_NOTE, vect_location,
2371 : : "vectorization_factor = ");
2372 : 13306 : dump_dec (MSG_NOTE, vectorization_factor);
2373 : 13306 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2374 : 13306 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2375 : : }
2376 : :
2377 : 342093 : if (max_vf != MAX_VECTORIZATION_FACTOR
2378 : 342093 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2379 : 41 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2380 : :
2381 : 342052 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2382 : :
2383 : : /* Analyze the alignment of the data-refs in the loop.
2384 : : Fail if a data reference is found that cannot be vectorized. */
2385 : :
2386 : 342052 : ok = vect_analyze_data_refs_alignment (loop_vinfo);
2387 : 342052 : if (!ok)
2388 : : {
2389 : 0 : if (dump_enabled_p ())
2390 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2391 : : "bad data alignment.\n");
2392 : 0 : return ok;
2393 : : }
2394 : :
2395 : : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2396 : : It is important to call pruning after vect_analyze_data_ref_accesses,
2397 : : since we use grouping information gathered by interleaving analysis. */
2398 : 342052 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2399 : 342052 : if (!ok)
2400 : 16533 : return ok;
2401 : :
2402 : : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2403 : : vectorization, since we do not want to add extra peeling or
2404 : : add versioning for alignment. */
2405 : 325519 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2406 : : /* This pass will decide on using loop versioning and/or loop peeling in
2407 : : order to enhance the alignment of data references in the loop. */
2408 : 311243 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2409 : 325519 : if (!ok)
2410 : 0 : return ok;
2411 : :
2412 : : /* Analyze operations in the SLP instances. We can't simply
2413 : : remove unsupported SLP instances as this makes the above
2414 : : SLP kind detection invalid and might also affect the VF. */
2415 : 325519 : if (! vect_slp_analyze_operations (loop_vinfo))
2416 : : {
2417 : 218771 : ok = opt_result::failure_at (vect_location,
2418 : : "unsupported SLP instances\n");
2419 : 218771 : goto again;
2420 : : }
2421 : :
2422 : : /* For now, we don't expect to mix both masking and length approaches for one
2423 : : loop, disable it if both are recorded. */
2424 : 106748 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2425 : 31 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2426 : 106779 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2427 : : {
2428 : 0 : if (dump_enabled_p ())
2429 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430 : : "can't vectorize a loop with partial vectors"
2431 : : " because we don't expect to mix different"
2432 : : " approaches with partial vectors for the"
2433 : : " same loop.\n");
2434 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2435 : : }
2436 : :
2437 : : /* If we still have the option of using partial vectors,
2438 : : check whether we can generate the necessary loop controls. */
2439 : 106748 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2440 : : {
2441 : 31 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2442 : : {
2443 : 31 : if (!vect_verify_full_masking (loop_vinfo)
2444 : 31 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2445 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2446 : : }
2447 : : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2448 : 0 : if (!vect_verify_loop_lens (loop_vinfo))
2449 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2450 : : }
2451 : :
2452 : : /* If we're vectorizing a loop that uses length "controls" and
2453 : : can iterate more than once, we apply decrementing IV approach
2454 : : in loop control. */
2455 : 106748 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2456 : 31 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2457 : 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2458 : 106748 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2459 : 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2460 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2461 : 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2462 : :
2463 : : /* If a loop uses length controls and has a decrementing loop control IV,
2464 : : we will normally pass that IV through a MIN_EXPR to calcaluate the
2465 : : basis for the length controls. E.g. in a loop that processes one
2466 : : element per scalar iteration, the number of elements would be
2467 : : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2468 : :
2469 : : This MIN_EXPR approach allows us to use pointer IVs with an invariant
2470 : : step, since only the final iteration of the vector loop can have
2471 : : inactive lanes.
2472 : :
2473 : : However, some targets have a dedicated instruction for calculating the
2474 : : preferred length, given the total number of elements that still need to
2475 : : be processed. This is encapsulated in the SELECT_VL internal function.
2476 : :
2477 : : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2478 : : to determine the basis for the length controls. However, unlike the
2479 : : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2480 : : lanes inactive in any iteration of the vector loop, not just the last
2481 : : iteration. This SELECT_VL approach therefore requires us to use pointer
2482 : : IVs with variable steps.
2483 : :
2484 : : Once we've decided how many elements should be processed by one
2485 : : iteration of the vector loop, we need to populate the rgroup controls.
2486 : : If a loop has multiple rgroups, we need to make sure that those rgroups
2487 : : "line up" (that is, they must be consistent about which elements are
2488 : : active and which aren't). This is done by vect_adjust_loop_lens_control.
2489 : :
2490 : : In principle, it would be possible to use vect_adjust_loop_lens_control
2491 : : on either the result of a MIN_EXPR or the result of a SELECT_VL.
2492 : : However:
2493 : :
2494 : : (1) In practice, it only makes sense to use SELECT_VL when a vector
2495 : : operation will be controlled directly by the result. It is not
2496 : : worth using SELECT_VL if it would only be the input to other
2497 : : calculations.
2498 : :
2499 : : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2500 : : pointer IV will need N updates by a variable amount (N-1 updates
2501 : : within the iteration and 1 update to move to the next iteration).
2502 : :
2503 : : Because of this, we prefer to use the MIN_EXPR approach whenever there
2504 : : is more than one length control.
2505 : :
2506 : : In addition, SELECT_VL always operates to a granularity of 1 unit.
2507 : : If we wanted to use it to control an SLP operation on N consecutive
2508 : : elements, we would need to make the SELECT_VL inputs measure scalar
2509 : : iterations (rather than elements) and then multiply the SELECT_VL
2510 : : result by N. But using SELECT_VL this way is inefficient because
2511 : : of (1) above.
2512 : :
2513 : : 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2514 : : satisfied:
2515 : :
2516 : : (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2517 : : (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2518 : :
2519 : : Since SELECT_VL (variable step) will make SCEV analysis failed and then
2520 : : we will fail to gain benefits of following unroll optimizations. We prefer
2521 : : using the MIN_EXPR approach in this situation. */
2522 : 106748 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2523 : : {
2524 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2525 : 0 : if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
2526 : : OPTIMIZE_FOR_SPEED)
2527 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () == 1
2528 : 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
2529 : 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2530 : : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2531 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2532 : :
2533 : : /* If any of the SLP instances cover more than a single lane
2534 : : we cannot use .SELECT_VL at the moment, even if the number
2535 : : of lanes is uniform throughout the SLP graph. */
2536 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2537 : 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
2538 : 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
2539 : 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
2540 : 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
2541 : : {
2542 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2543 : 0 : break;
2544 : : }
2545 : : }
2546 : :
2547 : : /* Decide whether this loop_vinfo should use partial vectors or peeling,
2548 : : assuming that the loop will be used as a main loop. We will redo
2549 : : this analysis later if we instead decide to use the loop as an
2550 : : epilogue loop. */
2551 : 106748 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
2552 : 106748 : if (!ok)
2553 : 0 : return ok;
2554 : :
2555 : : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2556 : : to be able to handle fewer than VF scalars, or needs to have a lower VF
2557 : : than the main loop. */
2558 : 106748 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2559 : 12226 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2560 : : {
2561 : 12220 : poly_uint64 unscaled_vf
2562 : 12220 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2563 : : orig_loop_vinfo->suggested_unroll_factor);
2564 : 12220 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
2565 : 271 : return opt_result::failure_at (vect_location,
2566 : : "Vectorization factor too high for"
2567 : : " epilogue loop.\n");
2568 : : }
2569 : :
2570 : : /* If the epilogue needs peeling for gaps but the main loop doesn't give
2571 : : up on the epilogue. */
2572 : 106477 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2573 : 11955 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2574 : 58 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
2575 : : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2576 : 4 : return opt_result::failure_at (vect_location,
2577 : : "Epilogue loop requires peeling for gaps "
2578 : : "but main loop does not.\n");
2579 : :
2580 : : /* If an epilogue loop is required make sure we can create one. */
2581 : 106473 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2582 : 105275 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2583 : 31691 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2584 : : {
2585 : 75814 : if (dump_enabled_p ())
2586 : 4947 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2587 : 75814 : if (!vect_can_advance_ivs_p (loop_vinfo)
2588 : 151108 : || !slpeel_can_duplicate_loop_p (loop,
2589 : : LOOP_VINFO_IV_EXIT (loop_vinfo),
2590 : 75294 : LOOP_VINFO_IV_EXIT (loop_vinfo)))
2591 : : {
2592 : 520 : ok = opt_result::failure_at (vect_location,
2593 : : "not vectorized: can't create required "
2594 : : "epilog loop\n");
2595 : 520 : goto again;
2596 : : }
2597 : : }
2598 : :
2599 : : /* Check the costings of the loop make vectorizing worthwhile. */
2600 : 105953 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2601 : 105953 : if (res < 0)
2602 : : {
2603 : 29590 : ok = opt_result::failure_at (vect_location,
2604 : : "Loop costings may not be worthwhile.\n");
2605 : 29590 : goto again;
2606 : : }
2607 : 76363 : if (!res)
2608 : 18481 : return opt_result::failure_at (vect_location,
2609 : : "Loop costings not worthwhile.\n");
2610 : :
2611 : : /* During peeling, we need to check if number of loop iterations is
2612 : : enough for both peeled prolog loop and vector loop. This check
2613 : : can be merged along with threshold check of loop versioning, so
2614 : : increase threshold for this case if necessary.
2615 : :
2616 : : If we are analyzing an epilogue we still want to check what its
2617 : : versioning threshold would be. If we decide to vectorize the epilogues we
2618 : : will want to use the lowest versioning threshold of all epilogues and main
2619 : : loop. This will enable us to enter a vectorized epilogue even when
2620 : : versioning the loop. We can't simply check whether the epilogue requires
2621 : : versioning though since we may have skipped some versioning checks when
2622 : : analyzing the epilogue. For instance, checks for alias versioning will be
2623 : : skipped when dealing with epilogues as we assume we already checked them
2624 : : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2625 : 57882 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2626 : : {
2627 : 5597 : poly_uint64 niters_th = 0;
2628 : 5597 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2629 : :
2630 : 5597 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2631 : : {
2632 : : /* Niters for peeled prolog loop. */
2633 : 5597 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2634 : : {
2635 : 125 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2636 : 125 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2637 : 125 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2638 : : }
2639 : : else
2640 : 5472 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2641 : : }
2642 : :
2643 : : /* Niters for at least one iteration of vectorized loop. */
2644 : 5597 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2645 : 5593 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2646 : : /* One additional iteration because of peeling for gap. */
2647 : 5597 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2648 : 55 : niters_th += 1;
2649 : :
2650 : : /* Use the same condition as vect_transform_loop to decide when to use
2651 : : the cost to determine a versioning threshold. */
2652 : 5597 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2653 : 5597 : && ordered_p (th, niters_th))
2654 : 3807 : niters_th = ordered_max (poly_uint64 (th), niters_th);
2655 : :
2656 : 5597 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2657 : : }
2658 : :
2659 : 57882 : gcc_assert (known_eq (vectorization_factor,
2660 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2661 : :
2662 : 57882 : single_lane_slp_done_for_suggested_uf = force_single_lane;
2663 : :
2664 : : /* Ok to vectorize! */
2665 : 57882 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2666 : 57882 : return opt_result::success ();
2667 : :
2668 : 248881 : again:
2669 : : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2670 : 248881 : gcc_assert (!ok);
2671 : :
2672 : : /* Try again with single-lane SLP. */
2673 : 248881 : if (force_single_lane)
2674 : 123498 : return ok;
2675 : :
2676 : : /* If we are applying suggested unroll factor, we don't need to
2677 : : re-try any more as we want to keep the SLP mode fixed. */
2678 : 125383 : if (applying_suggested_uf)
2679 : 6 : return ok;
2680 : :
2681 : : /* If there are reduction chains re-trying will fail anyway. */
2682 : 125377 : if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2683 : 323 : return ok;
2684 : :
2685 : : /* Likewise if the grouped loads or stores in the SLP cannot be handled
2686 : : via interleaving or lane instructions. */
2687 : : slp_instance instance;
2688 : : slp_tree node;
2689 : : unsigned i, j;
2690 : 484734 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2691 : : {
2692 : 360757 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
2693 : 0 : continue;
2694 : :
2695 : 360757 : stmt_vec_info vinfo;
2696 : 360757 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2697 : 360757 : if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2698 : 358106 : continue;
2699 : 2651 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2700 : 2651 : unsigned int size = DR_GROUP_SIZE (vinfo);
2701 : 2651 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
2702 : 2651 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
2703 : 4608 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2704 : 5294 : && ! vect_grouped_store_supported (vectype, size))
2705 : 686 : return opt_result::failure_at (vinfo->stmt,
2706 : : "unsupported grouped store\n");
2707 : 363069 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2708 : : {
2709 : 2152 : vinfo = SLP_TREE_REPRESENTATIVE (node);
2710 : 2152 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
2711 : : {
2712 : 1839 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2713 : 1839 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2714 : 1839 : size = DR_GROUP_SIZE (vinfo);
2715 : 1839 : vectype = SLP_TREE_VECTYPE (node);
2716 : 1839 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
2717 : 1839 : && ! vect_grouped_load_supported (vectype, single_element_p,
2718 : : size))
2719 : 391 : return opt_result::failure_at (vinfo->stmt,
2720 : : "unsupported grouped load\n");
2721 : : }
2722 : : }
2723 : : }
2724 : :
2725 : : /* Roll back state appropriately. Force single-lane SLP this time. */
2726 : 123977 : force_single_lane = true;
2727 : 123977 : if (dump_enabled_p ())
2728 : 3209 : dump_printf_loc (MSG_NOTE, vect_location,
2729 : : "re-trying with single-lane SLP\n");
2730 : :
2731 : : /* Reset the vectorization factor. */
2732 : 123977 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
2733 : : /* Free the SLP instances. */
2734 : 483641 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2735 : 359664 : vect_free_slp_instance (instance);
2736 : 123977 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2737 : : /* Reset SLP type to loop_vect on all stmts. */
2738 : 478241 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2739 : : {
2740 : 354264 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2741 : 354264 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
2742 : 635168 : !gsi_end_p (si); gsi_next (&si))
2743 : : {
2744 : 280904 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2745 : 280904 : STMT_SLP_TYPE (stmt_info) = not_vect;
2746 : 280904 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2747 : 280904 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2748 : : {
2749 : : /* vectorizable_reduction adjusts reduction stmt def-types,
2750 : : restore them to that of the PHI. */
2751 : 15546 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2752 : 15546 : = STMT_VINFO_DEF_TYPE (stmt_info);
2753 : 15546 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2754 : : (STMT_VINFO_REDUC_DEF (stmt_info)))
2755 : 15546 : = STMT_VINFO_DEF_TYPE (stmt_info);
2756 : : }
2757 : : }
2758 : 708528 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
2759 : 2127777 : !gsi_end_p (si); gsi_next (&si))
2760 : : {
2761 : 1773513 : if (is_gimple_debug (gsi_stmt (si)))
2762 : 623531 : continue;
2763 : 1149982 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2764 : 1149982 : STMT_SLP_TYPE (stmt_info) = not_vect;
2765 : 1149982 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2766 : : {
2767 : 213163 : stmt_vec_info pattern_stmt_info
2768 : : = STMT_VINFO_RELATED_STMT (stmt_info);
2769 : 213163 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2770 : 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2771 : :
2772 : 213163 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2773 : 213163 : STMT_SLP_TYPE (pattern_stmt_info) = not_vect;
2774 : 213163 : for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2775 : 431944 : !gsi_end_p (pi); gsi_next (&pi))
2776 : 218781 : STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2777 : 218781 : = not_vect;
2778 : : }
2779 : : }
2780 : : }
2781 : : /* Free optimized alias test DDRS. */
2782 : 123977 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2783 : 123977 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2784 : 123977 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2785 : : /* Reset target cost data. */
2786 : 123977 : delete loop_vinfo->vector_costs;
2787 : 123977 : loop_vinfo->vector_costs = nullptr;
2788 : : /* Reset accumulated rgroup information. */
2789 : 123977 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
2790 : 123977 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
2791 : 123977 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2792 : : /* Reset assorted flags. */
2793 : 123977 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2794 : 123977 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2795 : 123977 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2796 : 123977 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2797 : 123977 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2798 : 123977 : = saved_can_use_partial_vectors_p;
2799 : 123977 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2800 : 123977 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2801 : 123977 : if (loop_vinfo->scan_map)
2802 : 122 : loop_vinfo->scan_map->empty ();
2803 : :
2804 : 123977 : goto start_over;
2805 : : }
2806 : :
2807 : : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2808 : : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2809 : : OLD_LOOP_VINFO is better unless something specifically indicates
2810 : : otherwise.
2811 : :
2812 : : Note that this deliberately isn't a partial order. */
2813 : :
2814 : : static bool
2815 : 0 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2816 : : loop_vec_info old_loop_vinfo)
2817 : : {
2818 : 0 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2819 : 0 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2820 : :
2821 : 0 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2822 : 0 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2823 : :
2824 : : /* Always prefer a VF of loop->simdlen over any other VF. */
2825 : 0 : if (loop->simdlen)
2826 : : {
2827 : 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2828 : 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2829 : 0 : if (new_simdlen_p != old_simdlen_p)
2830 : : return new_simdlen_p;
2831 : : }
2832 : :
2833 : 0 : const auto *old_costs = old_loop_vinfo->vector_costs;
2834 : 0 : const auto *new_costs = new_loop_vinfo->vector_costs;
2835 : 0 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2836 : 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2837 : :
2838 : 0 : return new_costs->better_main_loop_than_p (old_costs);
2839 : : }
2840 : :
2841 : : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2842 : : true if we should. */
2843 : :
2844 : : static bool
2845 : 0 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2846 : : loop_vec_info old_loop_vinfo)
2847 : : {
2848 : 0 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2849 : : return false;
2850 : :
2851 : 0 : if (dump_enabled_p ())
2852 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2853 : : "***** Preferring vector mode %s to vector mode %s\n",
2854 : 0 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
2855 : 0 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
2856 : : return true;
2857 : : }
2858 : :
2859 : : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
2860 : : not NULL. When MASKED_P is not -1 override the default
2861 : : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it.
2862 : : Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance MODE_I to the next
2863 : : mode useful to analyze.
2864 : : Return the loop_vinfo on success and wrapped null on failure. */
2865 : :
2866 : : static opt_loop_vec_info
2867 : 413506 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2868 : : const vect_loop_form_info *loop_form_info,
2869 : : loop_vec_info orig_loop_vinfo,
2870 : : const vector_modes &vector_modes, unsigned &mode_i,
2871 : : int masked_p,
2872 : : machine_mode &autodetected_vector_mode,
2873 : : bool &fatal)
2874 : : {
2875 : 413506 : loop_vec_info loop_vinfo
2876 : 413506 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
2877 : :
2878 : 413506 : machine_mode vector_mode = vector_modes[mode_i];
2879 : 413506 : loop_vinfo->vector_mode = vector_mode;
2880 : 413506 : if (masked_p != -1)
2881 : 4 : loop_vinfo->can_use_partial_vectors_p = masked_p;
2882 : 413506 : unsigned int suggested_unroll_factor = 1;
2883 : 413506 : bool single_lane_slp_done_for_suggested_uf = false;
2884 : :
2885 : : /* Run the main analysis. */
2886 : 413506 : opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
2887 : : &suggested_unroll_factor,
2888 : : single_lane_slp_done_for_suggested_uf);
2889 : 413506 : if (dump_enabled_p ())
2890 : 18932 : dump_printf_loc (MSG_NOTE, vect_location,
2891 : : "***** Analysis %s with vector mode %s\n",
2892 : 18932 : res ? "succeeded" : "failed",
2893 : 18932 : GET_MODE_NAME (loop_vinfo->vector_mode));
2894 : :
2895 : 413506 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
2896 : 413506 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2897 : : /* Check to see if the user wants to unroll or if the target wants to. */
2898 : 464483 : && (suggested_unroll_factor > 1 || user_unroll > 1))
2899 : : {
2900 : 251 : if (suggested_unroll_factor == 1)
2901 : : {
2902 : 40 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
2903 : 40 : suggested_unroll_factor = user_unroll / assumed_vf;
2904 : 40 : if (suggested_unroll_factor > 1)
2905 : : {
2906 : 28 : if (dump_enabled_p ())
2907 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
2908 : : "setting unroll factor to %d based on user requested "
2909 : : "unroll factor %d and suggested vectorization "
2910 : : "factor: %d\n",
2911 : : suggested_unroll_factor, user_unroll, assumed_vf);
2912 : : }
2913 : : }
2914 : :
2915 : 251 : if (suggested_unroll_factor > 1)
2916 : : {
2917 : 239 : if (dump_enabled_p ())
2918 : 44 : dump_printf_loc (MSG_NOTE, vect_location,
2919 : : "***** Re-trying analysis for unrolling"
2920 : : " with unroll factor %d and %s slp.\n",
2921 : : suggested_unroll_factor,
2922 : : single_lane_slp_done_for_suggested_uf
2923 : : ? "single-lane" : "");
2924 : 239 : loop_vec_info unroll_vinfo
2925 : 239 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
2926 : 239 : unroll_vinfo->vector_mode = vector_mode;
2927 : 239 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2928 : 239 : opt_result new_res
2929 : 239 : = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
2930 : : single_lane_slp_done_for_suggested_uf);
2931 : 239 : if (new_res)
2932 : : {
2933 : 193 : delete loop_vinfo;
2934 : 193 : loop_vinfo = unroll_vinfo;
2935 : : }
2936 : : else
2937 : 46 : delete unroll_vinfo;
2938 : : }
2939 : :
2940 : : /* Record that we have honored a user unroll factor. */
2941 : 251 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
2942 : : }
2943 : :
2944 : : /* Remember the autodetected vector mode. */
2945 : 413506 : if (vector_mode == VOIDmode)
2946 : 211560 : autodetected_vector_mode = loop_vinfo->vector_mode;
2947 : :
2948 : : /* Advance mode_i, first skipping modes that would result in the
2949 : : same analysis result. */
2950 : 1869068 : while (mode_i + 1 < vector_modes.length ()
2951 : 1308020 : && vect_chooses_same_modes_p (loop_vinfo,
2952 : 580239 : vector_modes[mode_i + 1]))
2953 : : {
2954 : 314275 : if (dump_enabled_p ())
2955 : 15581 : dump_printf_loc (MSG_NOTE, vect_location,
2956 : : "***** The result for vector mode %s would"
2957 : : " be the same\n",
2958 : 15581 : GET_MODE_NAME (vector_modes[mode_i + 1]));
2959 : 314275 : mode_i += 1;
2960 : : }
2961 : 413506 : if (mode_i + 1 < vector_modes.length ()
2962 : 679470 : && vect_chooses_same_modes_p (autodetected_vector_mode,
2963 : 265964 : vector_modes[mode_i + 1]))
2964 : : {
2965 : 356 : if (dump_enabled_p ())
2966 : 9 : dump_printf_loc (MSG_NOTE, vect_location,
2967 : : "***** Skipping vector mode %s, which would"
2968 : : " repeat the analysis for %s\n",
2969 : 9 : GET_MODE_NAME (vector_modes[mode_i + 1]),
2970 : 9 : GET_MODE_NAME (autodetected_vector_mode));
2971 : 356 : mode_i += 1;
2972 : : }
2973 : 413506 : mode_i++;
2974 : :
2975 : 413506 : if (!res)
2976 : : {
2977 : 355817 : delete loop_vinfo;
2978 : 355817 : if (fatal)
2979 : 64868 : gcc_checking_assert (orig_loop_vinfo == NULL);
2980 : 355817 : return opt_loop_vec_info::propagate_failure (res);
2981 : : }
2982 : :
2983 : 57689 : return opt_loop_vec_info::success (loop_vinfo);
2984 : : }
2985 : :
2986 : : /* Function vect_analyze_loop.
2987 : :
2988 : : Apply a set of analyses on LOOP, and create a loop_vec_info struct
2989 : : for it. The different analyses will record information in the
2990 : : loop_vec_info struct. */
2991 : : opt_loop_vec_info
2992 : 474973 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
2993 : : vec_info_shared *shared)
2994 : : {
2995 : 474973 : DUMP_VECT_SCOPE ("analyze_loop_nest");
2996 : :
2997 : 474973 : if (loop_outer (loop)
2998 : 474973 : && loop_vec_info_for_loop (loop_outer (loop))
2999 : 475459 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3000 : 486 : return opt_loop_vec_info::failure_at (vect_location,
3001 : : "outer-loop already vectorized.\n");
3002 : :
3003 : 474487 : if (!find_loop_nest (loop, &shared->loop_nest))
3004 : 22363 : return opt_loop_vec_info::failure_at
3005 : 22363 : (vect_location,
3006 : : "not vectorized: loop nest containing two or more consecutive inner"
3007 : : " loops cannot be vectorized\n");
3008 : :
3009 : : /* Analyze the loop form. */
3010 : 452124 : vect_loop_form_info loop_form_info;
3011 : 452124 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
3012 : : &loop_form_info);
3013 : 452124 : if (!res)
3014 : : {
3015 : 240564 : if (dump_enabled_p ())
3016 : 1625 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3017 : : "bad loop form.\n");
3018 : 240564 : return opt_loop_vec_info::propagate_failure (res);
3019 : : }
3020 : 211560 : if (!integer_onep (loop_form_info.assumptions))
3021 : : {
3022 : : /* We consider to vectorize this loop by versioning it under
3023 : : some assumptions. In order to do this, we need to clear
3024 : : existing information computed by scev and niter analyzer. */
3025 : 10440 : scev_reset_htab ();
3026 : 10440 : free_numbers_of_iterations_estimates (loop);
3027 : : /* Also set flag for this loop so that following scev and niter
3028 : : analysis are done under the assumptions. */
3029 : 10440 : loop_constraint_set (loop, LOOP_C_FINITE);
3030 : : }
3031 : : else
3032 : : /* Clear the existing niter information to make sure the nonwrapping flag
3033 : : will be calculated and set propriately. */
3034 : 201120 : free_numbers_of_iterations_estimates (loop);
3035 : :
3036 : 211560 : auto_vector_modes vector_modes;
3037 : : /* Autodetect first vector size we try. */
3038 : 211560 : vector_modes.safe_push (VOIDmode);
3039 : 211560 : unsigned int autovec_flags
3040 : 423120 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3041 : 211560 : loop->simdlen != 0);
3042 : 211560 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3043 : 211560 : && !unlimited_cost_model (loop));
3044 : 211560 : machine_mode autodetected_vector_mode = VOIDmode;
3045 : 211560 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3046 : 211560 : unsigned int mode_i = 0;
3047 : 211560 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3048 : :
3049 : : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3050 : : a mode has not been analyzed. */
3051 : 211560 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
3052 : 2132268 : for (unsigned i = 0; i < vector_modes.length (); ++i)
3053 : 854574 : cached_vf_per_mode.safe_push (0);
3054 : :
3055 : : /* First determine the main loop vectorization mode, either the first
3056 : : one that works, starting with auto-detecting the vector mode and then
3057 : : following the targets order of preference, or the one with the
3058 : : lowest cost if pick_lowest_cost_p. */
3059 : 580562 : while (1)
3060 : : {
3061 : 396061 : bool fatal;
3062 : 396061 : unsigned int last_mode_i = mode_i;
3063 : : /* Set cached VF to -1 prior to analysis, which indicates a mode has
3064 : : failed. */
3065 : 396061 : cached_vf_per_mode[last_mode_i] = -1;
3066 : 396061 : opt_loop_vec_info loop_vinfo
3067 : 396061 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3068 : : NULL, vector_modes, mode_i, -1,
3069 : : autodetected_vector_mode, fatal);
3070 : 396061 : if (fatal)
3071 : : break;
3072 : :
3073 : 331193 : if (loop_vinfo)
3074 : : {
3075 : : /* Analyzis has been successful so update the VF value. The
3076 : : VF should always be a multiple of unroll_factor and we want to
3077 : : capture the original VF here. */
3078 : 50977 : cached_vf_per_mode[last_mode_i]
3079 : 50977 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3080 : 50977 : loop_vinfo->suggested_unroll_factor);
3081 : : /* Once we hit the desired simdlen for the first time,
3082 : : discard any previous attempts. */
3083 : 50977 : if (simdlen
3084 : 50977 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3085 : : {
3086 : 47 : delete first_loop_vinfo;
3087 : : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3088 : : simdlen = 0;
3089 : : }
3090 : 50930 : else if (pick_lowest_cost_p
3091 : 0 : && first_loop_vinfo
3092 : 50930 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3093 : : {
3094 : : /* Pick loop_vinfo over first_loop_vinfo. */
3095 : 0 : delete first_loop_vinfo;
3096 : 0 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3097 : : }
3098 : 50977 : if (first_loop_vinfo == NULL)
3099 : : first_loop_vinfo = loop_vinfo;
3100 : : else
3101 : : {
3102 : 2 : delete loop_vinfo;
3103 : 2 : loop_vinfo = opt_loop_vec_info::success (NULL);
3104 : : }
3105 : :
3106 : : /* Commit to first_loop_vinfo if we have no reason to try
3107 : : alternatives. */
3108 : 50977 : if (!simdlen && !pick_lowest_cost_p)
3109 : : break;
3110 : : }
3111 : 280225 : if (mode_i == vector_modes.length ()
3112 : 280225 : || autodetected_vector_mode == VOIDmode)
3113 : : break;
3114 : :
3115 : : /* Try the next biggest vector size. */
3116 : 184501 : if (dump_enabled_p ())
3117 : 3753 : dump_printf_loc (MSG_NOTE, vect_location,
3118 : : "***** Re-trying analysis with vector mode %s\n",
3119 : 3753 : GET_MODE_NAME (vector_modes[mode_i]));
3120 : 184501 : }
3121 : 211560 : if (!first_loop_vinfo)
3122 : 160590 : return opt_loop_vec_info::propagate_failure (res);
3123 : :
3124 : 50970 : if (dump_enabled_p ())
3125 : 8857 : dump_printf_loc (MSG_NOTE, vect_location,
3126 : : "***** Choosing vector mode %s\n",
3127 : 8857 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3128 : :
3129 : : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3130 : : enabled, SIMDUID is not set, it is the innermost loop and we have
3131 : : either already found the loop's SIMDLEN or there was no SIMDLEN to
3132 : : begin with.
3133 : : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3134 : 50970 : bool vect_epilogues = (!simdlen
3135 : 50968 : && loop->inner == NULL
3136 : 50468 : && param_vect_epilogues_nomask
3137 : 49433 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3138 : : /* No code motion support for multiple epilogues so for now
3139 : : not supported when multiple exits. */
3140 : 24613 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3141 : 24201 : && !loop->simduid
3142 : 73758 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3143 : 50970 : if (!vect_epilogues)
3144 : 38565 : return first_loop_vinfo;
3145 : :
3146 : : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3147 : :
3148 : : /* For epilogues start the analysis from the first mode. The motivation
3149 : : behind starting from the beginning comes from cases where the VECTOR_MODES
3150 : : array may contain length-agnostic and length-specific modes. Their
3151 : : ordering is not guaranteed, so we could end up picking a mode for the main
3152 : : loop that is after the epilogue's optimal mode. */
3153 : 12405 : int masked_p = -1;
3154 : 12405 : if (!unlimited_cost_model (loop)
3155 : 12405 : && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3156 : : != VOIDmode))
3157 : : {
3158 : 4 : vector_modes[0]
3159 : 4 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3160 : 4 : cached_vf_per_mode[0] = 0;
3161 : : }
3162 : : else
3163 : 12401 : vector_modes[0] = autodetected_vector_mode;
3164 : 12405 : mode_i = 0;
3165 : :
3166 : 24820 : bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
3167 : 12405 : || masked_p == 1);
3168 : : machine_mode mask_mode;
3169 : : if (supports_partial_vectors
3170 : 29 : && !partial_vectors_supported_p ()
3171 : 29 : && !(VECTOR_MODE_P (first_loop_vinfo->vector_mode)
3172 : 29 : && targetm.vectorize.get_mask_mode
3173 : 12415 : (first_loop_vinfo->vector_mode).exists (&mask_mode)
3174 : 29 : && SCALAR_INT_MODE_P (mask_mode)))
3175 : 19 : supports_partial_vectors = false;
3176 : 12405 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3177 : :
3178 : 12405 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3179 : 12589 : do
3180 : : {
3181 : : /* Let the user override what the target suggests. */
3182 : 12497 : if (OPTION_SET_P (param_vect_partial_vector_usage))
3183 : 34 : masked_p = -1;
3184 : :
3185 : 42843 : while (1)
3186 : : {
3187 : : /* If the target does not support partial vectors we can shorten the
3188 : : number of modes to analyze for the epilogue as we know we can't
3189 : : pick a mode that would lead to a VF at least as big as the
3190 : : FIRST_VINFO_VF. */
3191 : 56040 : if (!supports_partial_vectors
3192 : 42843 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3193 : : {
3194 : 13222 : mode_i++;
3195 : 26444 : if (mode_i == vector_modes.length ())
3196 : : break;
3197 : 25373 : continue;
3198 : : }
3199 : : /* We would need an exhaustive search to find all modes we
3200 : : skipped but that would lead to the same result as the
3201 : : analysis it was skipped for and where we'd could check
3202 : : cached_vf_per_mode against.
3203 : : Check for the autodetected mode, which is the common
3204 : : situation on x86 which does not perform cost comparison. */
3205 : 41797 : if (!supports_partial_vectors
3206 : 29597 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3207 : 58720 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3208 : 29099 : vector_modes[mode_i]))
3209 : : {
3210 : 12176 : mode_i++;
3211 : 24352 : if (mode_i == vector_modes.length ())
3212 : : break;
3213 : 12176 : continue;
3214 : : }
3215 : :
3216 : 17445 : if (dump_enabled_p ())
3217 : 3034 : dump_printf_loc (MSG_NOTE, vect_location,
3218 : : "***** Re-trying epilogue analysis with vector "
3219 : 3034 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3220 : :
3221 : 17445 : bool fatal;
3222 : 17445 : opt_loop_vec_info loop_vinfo
3223 : 17445 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3224 : : orig_loop_vinfo,
3225 : : vector_modes, mode_i, masked_p,
3226 : : autodetected_vector_mode, fatal);
3227 : 17445 : if (fatal)
3228 : : break;
3229 : :
3230 : 17445 : if (loop_vinfo)
3231 : : {
3232 : 6712 : if (pick_lowest_cost_p
3233 : 0 : && orig_loop_vinfo->epilogue_vinfo
3234 : 6712 : && vect_joust_loop_vinfos (loop_vinfo,
3235 : 0 : orig_loop_vinfo->epilogue_vinfo))
3236 : : {
3237 : 0 : gcc_assert (vect_epilogues);
3238 : 0 : delete orig_loop_vinfo->epilogue_vinfo;
3239 : 0 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3240 : : }
3241 : 6712 : if (!orig_loop_vinfo->epilogue_vinfo)
3242 : 6712 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3243 : : else
3244 : : {
3245 : 0 : delete loop_vinfo;
3246 : 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3247 : : }
3248 : :
3249 : : /* For now only allow one epilogue loop, but allow
3250 : : pick_lowest_cost_p to replace it, so commit to the
3251 : : first epilogue if we have no reason to try alternatives. */
3252 : 6712 : if (!pick_lowest_cost_p)
3253 : : break;
3254 : : }
3255 : :
3256 : : /* Revert back to the default from the suggested prefered
3257 : : epilogue vectorization mode. */
3258 : 10733 : masked_p = -1;
3259 : 21466 : if (mode_i == vector_modes.length ())
3260 : : break;
3261 : : }
3262 : :
3263 : 12497 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3264 : 12497 : if (!orig_loop_vinfo)
3265 : : break;
3266 : :
3267 : : /* When we selected a first vectorized epilogue, see if the target
3268 : : suggests to have another one. */
3269 : 6712 : masked_p = -1;
3270 : 6712 : if (!unlimited_cost_model (loop)
3271 : 3907 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3272 : 10615 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3273 : : != VOIDmode))
3274 : : {
3275 : 184 : vector_modes[0]
3276 : 92 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3277 : 92 : cached_vf_per_mode[0] = 0;
3278 : 92 : mode_i = 0;
3279 : : }
3280 : : else
3281 : : break;
3282 : 92 : }
3283 : : while (1);
3284 : :
3285 : 12405 : if (first_loop_vinfo->epilogue_vinfo)
3286 : : {
3287 : 6624 : poly_uint64 lowest_th
3288 : 6624 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3289 : 6624 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3290 : 6712 : do
3291 : : {
3292 : 6712 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3293 : 6712 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3294 : : || maybe_ne (lowest_th, 0U));
3295 : : /* Keep track of the known smallest versioning threshold. */
3296 : 6712 : if (ordered_p (lowest_th, th))
3297 : 6712 : lowest_th = ordered_min (lowest_th, th);
3298 : 6712 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3299 : : }
3300 : 6712 : while (epilog_vinfo);
3301 : 6624 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3302 : 6624 : if (dump_enabled_p ())
3303 : 1309 : dump_printf_loc (MSG_NOTE, vect_location,
3304 : : "***** Choosing epilogue vector mode %s\n",
3305 : 1309 : GET_MODE_NAME
3306 : : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3307 : : }
3308 : :
3309 : 12405 : return first_loop_vinfo;
3310 : 663684 : }
3311 : :
3312 : : /* Return true if there is an in-order reduction function for CODE, storing
3313 : : it in *REDUC_FN if so. */
3314 : :
3315 : : static bool
3316 : 4939 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3317 : : {
3318 : : /* We support MINUS_EXPR by negating the operand. This also preserves an
3319 : : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3320 : : (-0.0) = -0.0. */
3321 : 4939 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3322 : : {
3323 : 4263 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3324 : 0 : return true;
3325 : : }
3326 : : return false;
3327 : : }
3328 : :
3329 : : /* Function reduction_fn_for_scalar_code
3330 : :
3331 : : Input:
3332 : : CODE - tree_code of a reduction operations.
3333 : :
3334 : : Output:
3335 : : REDUC_FN - the corresponding internal function to be used to reduce the
3336 : : vector of partial results into a single scalar result, or IFN_LAST
3337 : : if the operation is a supported reduction operation, but does not have
3338 : : such an internal function.
3339 : :
3340 : : Return FALSE if CODE currently cannot be vectorized as reduction. */
3341 : :
3342 : : bool
3343 : 1995101 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3344 : : {
3345 : 1995101 : if (code.is_tree_code ())
3346 : 1995047 : switch (tree_code (code))
3347 : : {
3348 : 14165 : case MAX_EXPR:
3349 : 14165 : *reduc_fn = IFN_REDUC_MAX;
3350 : 14165 : return true;
3351 : :
3352 : 50525 : case MIN_EXPR:
3353 : 50525 : *reduc_fn = IFN_REDUC_MIN;
3354 : 50525 : return true;
3355 : :
3356 : 1074251 : case PLUS_EXPR:
3357 : 1074251 : *reduc_fn = IFN_REDUC_PLUS;
3358 : 1074251 : return true;
3359 : :
3360 : 255003 : case BIT_AND_EXPR:
3361 : 255003 : *reduc_fn = IFN_REDUC_AND;
3362 : 255003 : return true;
3363 : :
3364 : 283478 : case BIT_IOR_EXPR:
3365 : 283478 : *reduc_fn = IFN_REDUC_IOR;
3366 : 283478 : return true;
3367 : :
3368 : 42761 : case BIT_XOR_EXPR:
3369 : 42761 : *reduc_fn = IFN_REDUC_XOR;
3370 : 42761 : return true;
3371 : :
3372 : 274864 : case MULT_EXPR:
3373 : 274864 : case MINUS_EXPR:
3374 : 274864 : *reduc_fn = IFN_LAST;
3375 : 274864 : return true;
3376 : :
3377 : : default:
3378 : : return false;
3379 : : }
3380 : : else
3381 : 54 : switch (combined_fn (code))
3382 : : {
3383 : 30 : CASE_CFN_FMAX:
3384 : 30 : *reduc_fn = IFN_REDUC_FMAX;
3385 : 30 : return true;
3386 : :
3387 : 24 : CASE_CFN_FMIN:
3388 : 24 : *reduc_fn = IFN_REDUC_FMIN;
3389 : 24 : return true;
3390 : :
3391 : : default:
3392 : : return false;
3393 : : }
3394 : : }
3395 : :
3396 : : /* If there is a neutral value X such that a reduction would not be affected
3397 : : by the introduction of additional X elements, return that X, otherwise
3398 : : return null. CODE is the code of the reduction and SCALAR_TYPE is type
3399 : : of the scalar elements. If the reduction has just a single initial value
3400 : : then INITIAL_VALUE is that value, otherwise it is null.
3401 : : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3402 : : In that case no signed zero is returned. */
3403 : :
3404 : : tree
3405 : 71689 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3406 : : tree initial_value, bool as_initial)
3407 : : {
3408 : 71689 : if (code.is_tree_code ())
3409 : 71635 : switch (tree_code (code))
3410 : : {
3411 : 10138 : case DOT_PROD_EXPR:
3412 : 10138 : case SAD_EXPR:
3413 : 10138 : case MINUS_EXPR:
3414 : 10138 : case BIT_IOR_EXPR:
3415 : 10138 : case BIT_XOR_EXPR:
3416 : 10138 : return build_zero_cst (scalar_type);
3417 : 55857 : case WIDEN_SUM_EXPR:
3418 : 55857 : case PLUS_EXPR:
3419 : 55857 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3420 : 19 : return build_real (scalar_type, dconstm0);
3421 : : else
3422 : 55838 : return build_zero_cst (scalar_type);
3423 : :
3424 : 1891 : case MULT_EXPR:
3425 : 1891 : return build_one_cst (scalar_type);
3426 : :
3427 : 1315 : case BIT_AND_EXPR:
3428 : 1315 : return build_all_ones_cst (scalar_type);
3429 : :
3430 : : case MAX_EXPR:
3431 : : case MIN_EXPR:
3432 : : return initial_value;
3433 : :
3434 : 356 : default:
3435 : 356 : return NULL_TREE;
3436 : : }
3437 : : else
3438 : 54 : switch (combined_fn (code))
3439 : : {
3440 : : CASE_CFN_FMIN:
3441 : : CASE_CFN_FMAX:
3442 : : return initial_value;
3443 : :
3444 : 0 : default:
3445 : 0 : return NULL_TREE;
3446 : : }
3447 : : }
3448 : :
3449 : : /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3450 : : STMT is printed with a message MSG. */
3451 : :
3452 : : static void
3453 : 495 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3454 : : {
3455 : 495 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3456 : 495 : }
3457 : :
3458 : : /* Return true if we need an in-order reduction for operation CODE
3459 : : on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3460 : : overflow must wrap. */
3461 : :
3462 : : bool
3463 : 6339503 : needs_fold_left_reduction_p (tree type, code_helper code)
3464 : : {
3465 : : /* CHECKME: check for !flag_finite_math_only too? */
3466 : 6339503 : if (SCALAR_FLOAT_TYPE_P (type))
3467 : : {
3468 : 527571 : if (code.is_tree_code ())
3469 : 527521 : switch (tree_code (code))
3470 : : {
3471 : : case MIN_EXPR:
3472 : : case MAX_EXPR:
3473 : : return false;
3474 : :
3475 : 526061 : default:
3476 : 526061 : return !flag_associative_math;
3477 : : }
3478 : : else
3479 : 50 : switch (combined_fn (code))
3480 : : {
3481 : : CASE_CFN_FMIN:
3482 : : CASE_CFN_FMAX:
3483 : : return false;
3484 : :
3485 : 2 : default:
3486 : 2 : return !flag_associative_math;
3487 : : }
3488 : : }
3489 : :
3490 : 5811932 : if (INTEGRAL_TYPE_P (type))
3491 : 5811135 : return (!code.is_tree_code ()
3492 : 5811135 : || !operation_no_trapping_overflow (type, tree_code (code)));
3493 : :
3494 : 797 : if (SAT_FIXED_POINT_TYPE_P (type))
3495 : : return true;
3496 : :
3497 : : return false;
3498 : : }
3499 : :
3500 : : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3501 : : has a handled computation expression. Store the main reduction
3502 : : operation in *CODE. */
3503 : :
3504 : : static bool
3505 : 64034 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3506 : : tree loop_arg, code_helper *code,
3507 : : vec<std::pair<ssa_op_iter, use_operand_p> > &path,
3508 : : bool inner_loop_of_double_reduc)
3509 : : {
3510 : 64034 : auto_bitmap visited;
3511 : 64034 : tree lookfor = PHI_RESULT (phi);
3512 : 64034 : ssa_op_iter curri;
3513 : 64034 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3514 : 134628 : while (USE_FROM_PTR (curr) != loop_arg)
3515 : 6560 : curr = op_iter_next_use (&curri);
3516 : 64034 : curri.i = curri.numops;
3517 : 618891 : do
3518 : : {
3519 : 618891 : path.safe_push (std::make_pair (curri, curr));
3520 : 618891 : tree use = USE_FROM_PTR (curr);
3521 : 618891 : if (use == lookfor)
3522 : : break;
3523 : 555196 : gimple *def = SSA_NAME_DEF_STMT (use);
3524 : 555196 : if (gimple_nop_p (def)
3525 : 555196 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3526 : : {
3527 : 470045 : pop:
3528 : 470045 : do
3529 : : {
3530 : 470045 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3531 : 470045 : curri = x.first;
3532 : 470045 : curr = x.second;
3533 : 514961 : do
3534 : 514961 : curr = op_iter_next_use (&curri);
3535 : : /* Skip already visited or non-SSA operands (from iterating
3536 : : over PHI args). */
3537 : : while (curr != NULL_USE_OPERAND_P
3538 : 1029922 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3539 : 177177 : || ! bitmap_set_bit (visited,
3540 : 177177 : SSA_NAME_VERSION
3541 : : (USE_FROM_PTR (curr)))));
3542 : : }
3543 : 940090 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3544 : 158272 : if (curr == NULL_USE_OPERAND_P)
3545 : : break;
3546 : : }
3547 : : else
3548 : : {
3549 : 465664 : if (gimple_code (def) == GIMPLE_PHI)
3550 : 48766 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3551 : : else
3552 : 416898 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3553 : : while (curr != NULL_USE_OPERAND_P
3554 : 559765 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3555 : 487161 : || ! bitmap_set_bit (visited,
3556 : 487161 : SSA_NAME_VERSION
3557 : : (USE_FROM_PTR (curr)))))
3558 : 94101 : curr = op_iter_next_use (&curri);
3559 : 465664 : if (curr == NULL_USE_OPERAND_P)
3560 : 68740 : goto pop;
3561 : : }
3562 : : }
3563 : : while (1);
3564 : 64034 : if (dump_file && (dump_flags & TDF_DETAILS))
3565 : : {
3566 : 3645 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3567 : 3645 : unsigned i;
3568 : 3645 : std::pair<ssa_op_iter, use_operand_p> *x;
3569 : 12509 : FOR_EACH_VEC_ELT (path, i, x)
3570 : 8864 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3571 : 3645 : dump_printf (MSG_NOTE, "\n");
3572 : : }
3573 : :
3574 : : /* Check whether the reduction path detected is valid. */
3575 : 64034 : bool fail = path.length () == 0;
3576 : 64034 : bool neg = false;
3577 : 64034 : int sign = -1;
3578 : 64034 : *code = ERROR_MARK;
3579 : 140325 : for (unsigned i = 1; i < path.length (); ++i)
3580 : : {
3581 : 79198 : gimple *use_stmt = USE_STMT (path[i].second);
3582 : 79198 : gimple_match_op op;
3583 : 79198 : if (!gimple_extract_op (use_stmt, &op))
3584 : : {
3585 : : fail = true;
3586 : 2907 : break;
3587 : : }
3588 : 78639 : unsigned int opi = op.num_ops;
3589 : 78639 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3590 : : {
3591 : : /* The following make sure we can compute the operand index
3592 : : easily plus it mostly disallows chaining via COND_EXPR condition
3593 : : operands. */
3594 : 124811 : for (opi = 0; opi < op.num_ops; ++opi)
3595 : 123870 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3596 : : break;
3597 : : }
3598 : 3226 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3599 : : {
3600 : 6472 : for (opi = 0; opi < op.num_ops; ++opi)
3601 : 6472 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3602 : : break;
3603 : : }
3604 : 78639 : if (opi == op.num_ops)
3605 : : {
3606 : : fail = true;
3607 : : break;
3608 : : }
3609 : 77698 : op.code = canonicalize_code (op.code, op.type);
3610 : 77698 : if (op.code == MINUS_EXPR)
3611 : : {
3612 : 3844 : op.code = PLUS_EXPR;
3613 : : /* Track whether we negate the reduction value each iteration. */
3614 : 3844 : if (op.ops[1] == op.ops[opi])
3615 : 32 : neg = ! neg;
3616 : : }
3617 : 73854 : else if (op.code == IFN_COND_SUB)
3618 : : {
3619 : 2 : op.code = IFN_COND_ADD;
3620 : : /* Track whether we negate the reduction value each iteration. */
3621 : 2 : if (op.ops[2] == op.ops[opi])
3622 : 0 : neg = ! neg;
3623 : : }
3624 : : /* For an FMA the reduction code is the PLUS if the addition chain
3625 : : is the reduction. */
3626 : 73852 : else if (op.code == IFN_FMA && opi == 2)
3627 : 28 : op.code = PLUS_EXPR;
3628 : 77698 : if (CONVERT_EXPR_CODE_P (op.code)
3629 : 77698 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3630 : : ;
3631 : 74240 : else if (*code == ERROR_MARK)
3632 : : {
3633 : 62278 : *code = op.code;
3634 : 62278 : sign = TYPE_SIGN (op.type);
3635 : : }
3636 : 11962 : else if (op.code != *code)
3637 : : {
3638 : : fail = true;
3639 : : break;
3640 : : }
3641 : 10756 : else if ((op.code == MIN_EXPR
3642 : 10672 : || op.code == MAX_EXPR)
3643 : 10768 : && sign != TYPE_SIGN (op.type))
3644 : : {
3645 : : fail = true;
3646 : : break;
3647 : : }
3648 : : /* Check there's only a single stmt the op is used on. For the
3649 : : not value-changing tail and the last stmt allow out-of-loop uses,
3650 : : but not when this is the inner loop of a double reduction.
3651 : : ??? We could relax this and handle arbitrary live stmts by
3652 : : forcing a scalar epilogue for example. */
3653 : 76489 : imm_use_iterator imm_iter;
3654 : 76489 : use_operand_p use_p;
3655 : 76489 : gimple *op_use_stmt;
3656 : 76489 : unsigned cnt = 0;
3657 : 79685 : bool cond_fn_p = op.code.is_internal_fn ()
3658 : 3196 : && (conditional_internal_fn_code (internal_fn (op.code))
3659 : 76489 : != ERROR_MARK);
3660 : :
3661 : 178071 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3662 : : {
3663 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
3664 : : have op1 twice (once as definition, once as else) in the same
3665 : : operation. Enforce this. */
3666 : 101582 : if (cond_fn_p && op_use_stmt == use_stmt)
3667 : : {
3668 : 3140 : gcall *call = as_a<gcall *> (use_stmt);
3669 : 3140 : unsigned else_pos
3670 : 3140 : = internal_fn_else_index (internal_fn (op.code));
3671 : 3140 : if (gimple_call_arg (call, else_pos) != op.ops[opi])
3672 : : {
3673 : : fail = true;
3674 : : break;
3675 : : }
3676 : 15700 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
3677 : : {
3678 : 12560 : if (j == else_pos)
3679 : 3140 : continue;
3680 : 9420 : if (gimple_call_arg (call, j) == op.ops[opi])
3681 : 3140 : cnt++;
3682 : : }
3683 : : }
3684 : 98442 : else if (!is_gimple_debug (op_use_stmt)
3685 : 98442 : && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
3686 : 1776 : || flow_bb_inside_loop_p (loop,
3687 : 1776 : gimple_bb (op_use_stmt))))
3688 : 147237 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3689 : 73623 : cnt++;
3690 : 76489 : }
3691 : :
3692 : 76489 : if (cnt != 1)
3693 : : {
3694 : : fail = true;
3695 : : break;
3696 : : }
3697 : : }
3698 : 67289 : return ! fail && ! neg && *code != ERROR_MARK;
3699 : 64034 : }
3700 : :
3701 : : bool
3702 : 21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3703 : : tree loop_arg, enum tree_code code)
3704 : : {
3705 : 21 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3706 : 21 : code_helper code_;
3707 : 21 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
3708 : 21 : && code_ == code);
3709 : 21 : }
3710 : :
3711 : :
3712 : :
3713 : : /* Function vect_is_simple_reduction
3714 : :
3715 : : (1) Detect a cross-iteration def-use cycle that represents a simple
3716 : : reduction computation. We look for the following pattern:
3717 : :
3718 : : loop_header:
3719 : : a1 = phi < a0, a2 >
3720 : : a3 = ...
3721 : : a2 = operation (a3, a1)
3722 : :
3723 : : or
3724 : :
3725 : : a3 = ...
3726 : : loop_header:
3727 : : a1 = phi < a0, a2 >
3728 : : a2 = operation (a3, a1)
3729 : :
3730 : : such that:
3731 : : 1. operation is commutative and associative and it is safe to
3732 : : change the order of the computation
3733 : : 2. no uses for a2 in the loop (a2 is used out of the loop)
3734 : : 3. no uses of a1 in the loop besides the reduction operation
3735 : : 4. no uses of a1 outside the loop.
3736 : :
3737 : : Conditions 1,4 are tested here.
3738 : : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3739 : :
3740 : : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3741 : : nested cycles.
3742 : :
3743 : : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3744 : : reductions:
3745 : :
3746 : : a1 = phi < a0, a2 >
3747 : : inner loop (def of a3)
3748 : : a2 = phi < a3 >
3749 : :
3750 : : (4) Detect condition expressions, ie:
3751 : : for (int i = 0; i < N; i++)
3752 : : if (a[i] < val)
3753 : : ret_val = a[i];
3754 : :
3755 : : */
3756 : :
3757 : : static stmt_vec_info
3758 : 85020 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3759 : : gphi **double_reduc, bool *reduc_chain_p, bool slp)
3760 : : {
3761 : 85020 : gphi *phi = as_a <gphi *> (phi_info->stmt);
3762 : 85020 : gimple *phi_use_stmt = NULL;
3763 : 85020 : imm_use_iterator imm_iter;
3764 : 85020 : use_operand_p use_p;
3765 : :
3766 : : /* When double_reduc is NULL we are testing the inner loop of a
3767 : : double reduction. */
3768 : 85020 : bool inner_loop_of_double_reduc = double_reduc == NULL;
3769 : 85020 : if (double_reduc)
3770 : 84029 : *double_reduc = NULL;
3771 : 85020 : *reduc_chain_p = false;
3772 : 85020 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3773 : :
3774 : 85020 : tree phi_name = PHI_RESULT (phi);
3775 : : /* ??? If there are no uses of the PHI result the inner loop reduction
3776 : : won't be detected as possibly double-reduction by vectorizable_reduction
3777 : : because that tries to walk the PHI arg from the preheader edge which
3778 : : can be constant. See PR60382. */
3779 : 85020 : if (has_zero_uses (phi_name))
3780 : : return NULL;
3781 : 84892 : class loop *loop = (gimple_bb (phi))->loop_father;
3782 : 84892 : unsigned nphi_def_loop_uses = 0;
3783 : 210314 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3784 : : {
3785 : 129277 : gimple *use_stmt = USE_STMT (use_p);
3786 : 129277 : if (is_gimple_debug (use_stmt))
3787 : 31168 : continue;
3788 : :
3789 : 98109 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3790 : : {
3791 : 3855 : if (dump_enabled_p ())
3792 : 30 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3793 : : "intermediate value used outside loop.\n");
3794 : :
3795 : 3855 : return NULL;
3796 : : }
3797 : :
3798 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
3799 : : op1 twice (once as definition, once as else) in the same operation.
3800 : : Only count it as one. */
3801 : 94254 : if (use_stmt != phi_use_stmt)
3802 : : {
3803 : 90853 : nphi_def_loop_uses++;
3804 : 90853 : phi_use_stmt = use_stmt;
3805 : : }
3806 : : }
3807 : :
3808 : 81037 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3809 : 81037 : if (TREE_CODE (latch_def) != SSA_NAME)
3810 : : {
3811 : 1222 : if (dump_enabled_p ())
3812 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3813 : : "reduction: not ssa_name: %T\n", latch_def);
3814 : 1222 : return NULL;
3815 : : }
3816 : :
3817 : 79815 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3818 : 79815 : if (!def_stmt_info
3819 : 79815 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3820 : 135 : return NULL;
3821 : :
3822 : 79680 : bool nested_in_vect_loop
3823 : 79680 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3824 : 79680 : unsigned nlatch_def_loop_uses = 0;
3825 : 79680 : auto_vec<gphi *, 3> lcphis;
3826 : 300292 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3827 : : {
3828 : 220612 : gimple *use_stmt = USE_STMT (use_p);
3829 : 220612 : if (is_gimple_debug (use_stmt))
3830 : 60950 : continue;
3831 : 159662 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3832 : 88315 : nlatch_def_loop_uses++;
3833 : : else
3834 : : /* We can have more than one loop-closed PHI. */
3835 : 71347 : lcphis.safe_push (as_a <gphi *> (use_stmt));
3836 : : }
3837 : :
3838 : : /* If we are vectorizing an inner reduction we are executing that
3839 : : in the original order only in case we are not dealing with a
3840 : : double reduction. */
3841 : 79680 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3842 : : {
3843 : 2195 : if (dump_enabled_p ())
3844 : 361 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3845 : : "detected nested cycle: ");
3846 : 2195 : return def_stmt_info;
3847 : : }
3848 : :
3849 : : /* When the inner loop of a double reduction ends up with more than
3850 : : one loop-closed PHI we have failed to classify alternate such
3851 : : PHIs as double reduction, leading to wrong code. See PR103237. */
3852 : 78464 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
3853 : : {
3854 : 1 : if (dump_enabled_p ())
3855 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3856 : : "unhandle double reduction\n");
3857 : 1 : return NULL;
3858 : : }
3859 : :
3860 : : /* If this isn't a nested cycle or if the nested cycle reduction value
3861 : : is used ouside of the inner loop we cannot handle uses of the reduction
3862 : : value. */
3863 : 77484 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3864 : : {
3865 : 12314 : if (dump_enabled_p ())
3866 : 316 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3867 : : "reduction used in loop.\n");
3868 : 12314 : return NULL;
3869 : : }
3870 : :
3871 : : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3872 : : defined in the inner loop. */
3873 : 65170 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3874 : : {
3875 : 1157 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
3876 : 1157 : if (gimple_phi_num_args (def_stmt) != 1
3877 : 1157 : || TREE_CODE (op1) != SSA_NAME)
3878 : : {
3879 : 52 : if (dump_enabled_p ())
3880 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3881 : : "unsupported phi node definition.\n");
3882 : :
3883 : 52 : return NULL;
3884 : : }
3885 : :
3886 : : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3887 : : and the latch definition op1. */
3888 : 1105 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
3889 : 1105 : if (gimple_bb (def1)
3890 : 1105 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3891 : 1105 : && loop->inner
3892 : 1097 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3893 : 1097 : && (is_gimple_assign (def1) || is_gimple_call (def1))
3894 : 1088 : && is_a <gphi *> (phi_use_stmt)
3895 : 1077 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
3896 : 1077 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
3897 : : loop_latch_edge (loop->inner)))
3898 : 2180 : && lcphis.length () == 1)
3899 : : {
3900 : 991 : if (dump_enabled_p ())
3901 : 134 : report_vect_op (MSG_NOTE, def_stmt,
3902 : : "detected double reduction: ");
3903 : :
3904 : 991 : *double_reduc = as_a <gphi *> (phi_use_stmt);
3905 : 991 : return def_stmt_info;
3906 : : }
3907 : :
3908 : 114 : return NULL;
3909 : : }
3910 : :
3911 : : /* Look for the expression computing latch_def from then loop PHI result. */
3912 : 64013 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3913 : 64013 : code_helper code;
3914 : 64013 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3915 : : path, inner_loop_of_double_reduc))
3916 : : {
3917 : 60758 : STMT_VINFO_REDUC_CODE (phi_info) = code;
3918 : 60758 : if (code == COND_EXPR && !nested_in_vect_loop)
3919 : 4147 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3920 : :
3921 : : /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3922 : : reduction chain for which the additional restriction is that
3923 : : all operations in the chain are the same. */
3924 : 60758 : auto_vec<stmt_vec_info, 8> reduc_chain;
3925 : 60758 : unsigned i;
3926 : 60758 : bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3927 : 196321 : for (i = path.length () - 1; i >= 1; --i)
3928 : : {
3929 : 74805 : gimple *stmt = USE_STMT (path[i].second);
3930 : 74805 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3931 : 74805 : gimple_match_op op;
3932 : 74805 : if (!gimple_extract_op (stmt, &op))
3933 : 0 : gcc_unreachable ();
3934 : 74805 : if (gassign *assign = dyn_cast<gassign *> (stmt))
3935 : 71599 : STMT_VINFO_REDUC_IDX (stmt_info)
3936 : 71599 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3937 : : else
3938 : : {
3939 : 3206 : gcall *call = as_a<gcall *> (stmt);
3940 : 3206 : STMT_VINFO_REDUC_IDX (stmt_info)
3941 : 3206 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
3942 : : }
3943 : 74805 : bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3944 : 74805 : && (i == 1 || i == path.length () - 1));
3945 : 7172 : if ((op.code != code && !leading_conversion)
3946 : : /* We can only handle the final value in epilogue
3947 : : generation for reduction chains. */
3948 : 78190 : || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3949 : : is_slp_reduc = false;
3950 : : /* For reduction chains we support a trailing/leading
3951 : : conversions. We do not store those in the actual chain. */
3952 : 74805 : if (leading_conversion)
3953 : 3385 : continue;
3954 : 71420 : reduc_chain.safe_push (stmt_info);
3955 : : }
3956 : 112684 : if (slp && is_slp_reduc && reduc_chain.length () > 1)
3957 : : {
3958 : 8465 : for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3959 : : {
3960 : 5200 : REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3961 : 5200 : REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3962 : : }
3963 : 3265 : REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3964 : 3265 : REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3965 : :
3966 : : /* Save the chain for further analysis in SLP detection. */
3967 : 3265 : LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3968 : 6530 : REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3969 : :
3970 : 3265 : *reduc_chain_p = true;
3971 : 3265 : if (dump_enabled_p ())
3972 : 293 : dump_printf_loc (MSG_NOTE, vect_location,
3973 : : "reduction: detected reduction chain\n");
3974 : : }
3975 : 57493 : else if (dump_enabled_p ())
3976 : 3286 : dump_printf_loc (MSG_NOTE, vect_location,
3977 : : "reduction: detected reduction\n");
3978 : :
3979 : 60758 : return def_stmt_info;
3980 : 60758 : }
3981 : :
3982 : 3255 : if (dump_enabled_p ())
3983 : 80 : dump_printf_loc (MSG_NOTE, vect_location,
3984 : : "reduction: unknown pattern\n");
3985 : :
3986 : : return NULL;
3987 : 143693 : }
3988 : :
3989 : : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3990 : : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3991 : : or -1 if not known. */
3992 : :
3993 : : static int
3994 : 350900 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3995 : : {
3996 : 350900 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3997 : 350900 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3998 : : {
3999 : 138980 : if (dump_enabled_p ())
4000 : 2751 : dump_printf_loc (MSG_NOTE, vect_location,
4001 : : "cost model: epilogue peel iters set to vf/2 "
4002 : : "because loop iterations are unknown .\n");
4003 : 138980 : return assumed_vf / 2;
4004 : : }
4005 : : else
4006 : : {
4007 : 211920 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4008 : 211920 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
4009 : 211920 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4010 : : /* If we need to peel for gaps, but no peeling is required, we have to
4011 : : peel VF iterations. */
4012 : 211920 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4013 : 211920 : peel_iters_epilogue = assumed_vf;
4014 : 211920 : return peel_iters_epilogue;
4015 : : }
4016 : : }
4017 : :
4018 : : /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4019 : : int
4020 : 271205 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4021 : : int *peel_iters_epilogue,
4022 : : stmt_vector_for_cost *scalar_cost_vec,
4023 : : stmt_vector_for_cost *prologue_cost_vec,
4024 : : stmt_vector_for_cost *epilogue_cost_vec)
4025 : : {
4026 : 271205 : int retval = 0;
4027 : :
4028 : 271205 : *peel_iters_epilogue
4029 : 271205 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4030 : :
4031 : 271205 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4032 : : {
4033 : : /* If peeled iterations are known but number of scalar loop
4034 : : iterations are unknown, count a taken branch per peeled loop. */
4035 : 91698 : if (peel_iters_prologue > 0)
4036 : 54384 : retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4037 : : vect_prologue);
4038 : 91698 : if (*peel_iters_epilogue > 0)
4039 : 91626 : retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4040 : : vect_epilogue);
4041 : : }
4042 : :
4043 : 271205 : stmt_info_for_cost *si;
4044 : 271205 : int j;
4045 : 271205 : if (peel_iters_prologue)
4046 : 677182 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4047 : 562548 : retval += record_stmt_cost (prologue_cost_vec,
4048 : 562548 : si->count * peel_iters_prologue,
4049 : : si->kind, si->stmt_info, si->misalign,
4050 : : vect_prologue);
4051 : 271205 : if (*peel_iters_epilogue)
4052 : 1058436 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4053 : 876693 : retval += record_stmt_cost (epilogue_cost_vec,
4054 : 876693 : si->count * *peel_iters_epilogue,
4055 : : si->kind, si->stmt_info, si->misalign,
4056 : : vect_epilogue);
4057 : :
4058 : 271205 : return retval;
4059 : : }
4060 : :
4061 : : /* Function vect_estimate_min_profitable_iters
4062 : :
4063 : : Return the number of iterations required for the vector version of the
4064 : : loop to be profitable relative to the cost of the scalar version of the
4065 : : loop.
4066 : :
4067 : : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4068 : : of iterations for vectorization. -1 value means loop vectorization
4069 : : is not profitable. This returned value may be used for dynamic
4070 : : profitability check.
4071 : :
4072 : : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4073 : : for static check against estimated number of iterations. */
4074 : :
4075 : : static void
4076 : 95935 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4077 : : int *ret_min_profitable_niters,
4078 : : int *ret_min_profitable_estimate,
4079 : : unsigned *suggested_unroll_factor)
4080 : : {
4081 : 95935 : int min_profitable_iters;
4082 : 95935 : int min_profitable_estimate;
4083 : 95935 : int peel_iters_prologue;
4084 : 95935 : int peel_iters_epilogue;
4085 : 95935 : unsigned vec_inside_cost = 0;
4086 : 95935 : int vec_outside_cost = 0;
4087 : 95935 : unsigned vec_prologue_cost = 0;
4088 : 95935 : unsigned vec_epilogue_cost = 0;
4089 : 95935 : int scalar_single_iter_cost = 0;
4090 : 95935 : int scalar_outside_cost = 0;
4091 : 95935 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4092 : 95935 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4093 : 95935 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
4094 : :
4095 : : /* Cost model disabled. */
4096 : 95935 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4097 : : {
4098 : 16052 : if (dump_enabled_p ())
4099 : 9805 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4100 : 16052 : *ret_min_profitable_niters = 0;
4101 : 16052 : *ret_min_profitable_estimate = 0;
4102 : 16052 : return;
4103 : : }
4104 : :
4105 : : /* Requires loop versioning tests to handle misalignment. */
4106 : 79883 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4107 : : {
4108 : : /* FIXME: Make cost depend on complexity of individual check. */
4109 : 27 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4110 : 27 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4111 : 27 : if (dump_enabled_p ())
4112 : 1 : dump_printf (MSG_NOTE,
4113 : : "cost model: Adding cost of checks for loop "
4114 : : "versioning to treat misalignment.\n");
4115 : : }
4116 : :
4117 : : /* Requires loop versioning with alias checks. */
4118 : 79883 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4119 : : {
4120 : : /* FIXME: Make cost depend on complexity of individual check. */
4121 : 4079 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4122 : 4079 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4123 : 4079 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4124 : 0 : if (len)
4125 : : /* Count LEN - 1 ANDs and LEN comparisons. */
4126 : 0 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4127 : : scalar_stmt, vect_prologue);
4128 : 4079 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4129 : 1134 : if (len)
4130 : : {
4131 : : /* Count LEN - 1 ANDs and LEN comparisons. */
4132 : 1134 : unsigned int nstmts = len * 2 - 1;
4133 : : /* +1 for each bias that needs adding. */
4134 : 2268 : for (unsigned int i = 0; i < len; ++i)
4135 : 1134 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4136 : 123 : nstmts += 1;
4137 : 1134 : (void) add_stmt_cost (target_cost_data, nstmts,
4138 : : scalar_stmt, vect_prologue);
4139 : : }
4140 : 4079 : if (dump_enabled_p ())
4141 : 15 : dump_printf (MSG_NOTE,
4142 : : "cost model: Adding cost of checks for loop "
4143 : : "versioning aliasing.\n");
4144 : : }
4145 : :
4146 : : /* Requires loop versioning with niter checks. */
4147 : 79883 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4148 : : {
4149 : : /* FIXME: Make cost depend on complexity of individual check. */
4150 : 684 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4151 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4152 : 684 : if (dump_enabled_p ())
4153 : 1 : dump_printf (MSG_NOTE,
4154 : : "cost model: Adding cost of checks for loop "
4155 : : "versioning niters.\n");
4156 : : }
4157 : :
4158 : 79883 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4159 : 4774 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4160 : : vect_prologue);
4161 : :
4162 : : /* Count statements in scalar loop. Using this as scalar cost for a single
4163 : : iteration for now.
4164 : :
4165 : : TODO: Add outer loop support.
4166 : :
4167 : : TODO: Consider assigning different costs to different scalar
4168 : : statements. */
4169 : :
4170 : 79883 : scalar_single_iter_cost = (loop_vinfo->scalar_costs->total_cost ()
4171 : 79883 : * param_vect_scalar_cost_multiplier) / 100;
4172 : :
4173 : : /* Add additional cost for the peeled instructions in prologue and epilogue
4174 : : loop. (For fully-masked loops there will be no peeling.)
4175 : :
4176 : : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4177 : : at compile-time - we assume it's vf/2 (the worst would be vf-1).
4178 : :
4179 : : TODO: Build an expression that represents peel_iters for prologue and
4180 : : epilogue to be used in a run-time test. */
4181 : :
4182 : 79883 : bool prologue_need_br_taken_cost = false;
4183 : 79883 : bool prologue_need_br_not_taken_cost = false;
4184 : :
4185 : : /* Calculate peel_iters_prologue. */
4186 : 79883 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4187 : : peel_iters_prologue = 0;
4188 : 79883 : else if (npeel < 0)
4189 : : {
4190 : 169 : peel_iters_prologue = assumed_vf / 2;
4191 : 169 : if (dump_enabled_p ())
4192 : 4 : dump_printf (MSG_NOTE, "cost model: "
4193 : : "prologue peel iters set to vf/2.\n");
4194 : :
4195 : : /* If peeled iterations are unknown, count a taken branch and a not taken
4196 : : branch per peeled loop. Even if scalar loop iterations are known,
4197 : : vector iterations are not known since peeled prologue iterations are
4198 : : not known. Hence guards remain the same. */
4199 : : prologue_need_br_taken_cost = true;
4200 : : prologue_need_br_not_taken_cost = true;
4201 : : }
4202 : : else
4203 : : {
4204 : 79714 : peel_iters_prologue = npeel;
4205 : 79714 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4206 : : /* If peeled iterations are known but number of scalar loop
4207 : : iterations are unknown, count a taken branch per peeled loop. */
4208 : 79883 : prologue_need_br_taken_cost = true;
4209 : : }
4210 : :
4211 : 79883 : bool epilogue_need_br_taken_cost = false;
4212 : 79883 : bool epilogue_need_br_not_taken_cost = false;
4213 : :
4214 : : /* Calculate peel_iters_epilogue. */
4215 : 79883 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4216 : : /* We need to peel exactly one iteration for gaps. */
4217 : 19 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4218 : 79864 : else if (npeel < 0)
4219 : : {
4220 : : /* If peeling for alignment is unknown, loop bound of main loop
4221 : : becomes unknown. */
4222 : 169 : peel_iters_epilogue = assumed_vf / 2;
4223 : 169 : if (dump_enabled_p ())
4224 : 4 : dump_printf (MSG_NOTE, "cost model: "
4225 : : "epilogue peel iters set to vf/2 because "
4226 : : "peeling for alignment is unknown.\n");
4227 : :
4228 : : /* See the same reason above in peel_iters_prologue calculation. */
4229 : : epilogue_need_br_taken_cost = true;
4230 : : epilogue_need_br_not_taken_cost = true;
4231 : : }
4232 : : else
4233 : : {
4234 : 79695 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4235 : 79695 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4236 : : /* If peeled iterations are known but number of scalar loop
4237 : : iterations are unknown, count a taken branch per peeled loop. */
4238 : 79883 : epilogue_need_br_taken_cost = true;
4239 : : }
4240 : :
4241 : 79883 : stmt_info_for_cost *si;
4242 : 79883 : int j;
4243 : : /* Add costs associated with peel_iters_prologue. */
4244 : 79883 : if (peel_iters_prologue)
4245 : 809 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4246 : : {
4247 : 631 : (void) add_stmt_cost (target_cost_data,
4248 : 631 : si->count * peel_iters_prologue, si->kind,
4249 : : si->stmt_info, si->node, si->vectype,
4250 : : si->misalign, vect_prologue);
4251 : : }
4252 : :
4253 : : /* Add costs associated with peel_iters_epilogue. */
4254 : 79883 : if (peel_iters_epilogue)
4255 : 271121 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4256 : : {
4257 : 214889 : (void) add_stmt_cost (target_cost_data,
4258 : 214889 : si->count * peel_iters_epilogue, si->kind,
4259 : : si->stmt_info, si->node, si->vectype,
4260 : : si->misalign, vect_epilogue);
4261 : : }
4262 : :
4263 : : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4264 : :
4265 : 79883 : if (prologue_need_br_taken_cost)
4266 : 170 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4267 : : vect_prologue);
4268 : :
4269 : 79883 : if (prologue_need_br_not_taken_cost)
4270 : 169 : (void) add_stmt_cost (target_cost_data, 1,
4271 : : cond_branch_not_taken, vect_prologue);
4272 : :
4273 : 79883 : if (epilogue_need_br_taken_cost)
4274 : 46898 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4275 : : vect_epilogue);
4276 : :
4277 : 79883 : if (epilogue_need_br_not_taken_cost)
4278 : 169 : (void) add_stmt_cost (target_cost_data, 1,
4279 : : cond_branch_not_taken, vect_epilogue);
4280 : :
4281 : : /* Take care of special costs for rgroup controls of partial vectors. */
4282 : 19 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4283 : 79902 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4284 : : == vect_partial_vectors_avx512))
4285 : : {
4286 : : /* Calculate how many masks we need to generate. */
4287 : 19 : unsigned int num_masks = 0;
4288 : 19 : bool need_saturation = false;
4289 : 78 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4290 : 21 : if (rgm.type)
4291 : : {
4292 : 19 : unsigned nvectors = rgm.factor;
4293 : 19 : num_masks += nvectors;
4294 : 19 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4295 : 19 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4296 : 5 : need_saturation = true;
4297 : : }
4298 : :
4299 : : /* ??? The target isn't able to identify the costs below as
4300 : : producing masks so it cannot penaltize cases where we'd run
4301 : : out of mask registers for example. */
4302 : :
4303 : : /* ??? We are also failing to account for smaller vector masks
4304 : : we generate by splitting larger masks in vect_get_loop_mask. */
4305 : :
4306 : : /* In the worst case, we need to generate each mask in the prologue
4307 : : and in the loop body. We need one splat per group and one
4308 : : compare per mask.
4309 : :
4310 : : Sometimes the prologue mask will fold to a constant,
4311 : : so the actual prologue cost might be smaller. However, it's
4312 : : simpler and safer to use the worst-case cost; if this ends up
4313 : : being the tie-breaker between vectorizing or not, then it's
4314 : : probably better not to vectorize. */
4315 : 19 : (void) add_stmt_cost (target_cost_data,
4316 : : num_masks
4317 : 19 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4318 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4319 : : vect_prologue);
4320 : 38 : (void) add_stmt_cost (target_cost_data,
4321 : : num_masks
4322 : 38 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4323 : : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4324 : :
4325 : : /* When we need saturation we need it both in the prologue and
4326 : : the epilogue. */
4327 : 19 : if (need_saturation)
4328 : : {
4329 : 5 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4330 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4331 : 5 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4332 : : NULL, NULL, NULL_TREE, 0, vect_body);
4333 : : }
4334 : : }
4335 : 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4336 : 79864 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4337 : : == vect_partial_vectors_while_ult))
4338 : : {
4339 : : /* Calculate how many masks we need to generate. */
4340 : : unsigned int num_masks = 0;
4341 : : rgroup_controls *rgm;
4342 : : unsigned int num_vectors_m1;
4343 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4344 : : num_vectors_m1, rgm)
4345 : 0 : if (rgm->type)
4346 : 0 : num_masks += num_vectors_m1 + 1;
4347 : 0 : gcc_assert (num_masks > 0);
4348 : :
4349 : : /* In the worst case, we need to generate each mask in the prologue
4350 : : and in the loop body. One of the loop body mask instructions
4351 : : replaces the comparison in the scalar loop, and since we don't
4352 : : count the scalar comparison against the scalar body, we shouldn't
4353 : : count that vector instruction against the vector body either.
4354 : :
4355 : : Sometimes we can use unpacks instead of generating prologue
4356 : : masks and sometimes the prologue mask will fold to a constant,
4357 : : so the actual prologue cost might be smaller. However, it's
4358 : : simpler and safer to use the worst-case cost; if this ends up
4359 : : being the tie-breaker between vectorizing or not, then it's
4360 : : probably better not to vectorize. */
4361 : 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4362 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4363 : : vect_prologue);
4364 : 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4365 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4366 : : vect_body);
4367 : : }
4368 : 79864 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4369 : : {
4370 : : /* Referring to the functions vect_set_loop_condition_partial_vectors
4371 : : and vect_set_loop_controls_directly, we need to generate each
4372 : : length in the prologue and in the loop body if required. Although
4373 : : there are some possible optimizations, we consider the worst case
4374 : : here. */
4375 : :
4376 : 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4377 : 0 : signed char partial_load_store_bias
4378 : : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4379 : 0 : bool need_iterate_p
4380 : 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4381 : 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4382 : :
4383 : : /* Calculate how many statements to be added. */
4384 : 0 : unsigned int prologue_stmts = 0;
4385 : 0 : unsigned int body_stmts = 0;
4386 : :
4387 : 0 : rgroup_controls *rgc;
4388 : 0 : unsigned int num_vectors_m1;
4389 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4390 : 0 : if (rgc->type)
4391 : : {
4392 : : /* May need one SHIFT for nitems_total computation. */
4393 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4394 : 0 : if (nitems != 1 && !niters_known_p)
4395 : 0 : prologue_stmts += 1;
4396 : :
4397 : : /* May need one MAX and one MINUS for wrap around. */
4398 : 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4399 : 0 : prologue_stmts += 2;
4400 : :
4401 : : /* Need one MAX and one MINUS for each batch limit excepting for
4402 : : the 1st one. */
4403 : 0 : prologue_stmts += num_vectors_m1 * 2;
4404 : :
4405 : 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4406 : :
4407 : : /* Need to set up lengths in prologue, only one MIN required
4408 : : for each since start index is zero. */
4409 : 0 : prologue_stmts += num_vectors;
4410 : :
4411 : : /* If we have a non-zero partial load bias, we need one PLUS
4412 : : to adjust the load length. */
4413 : 0 : if (partial_load_store_bias != 0)
4414 : 0 : body_stmts += 1;
4415 : :
4416 : 0 : unsigned int length_update_cost = 0;
4417 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4418 : : /* For decrement IV style, Each only need a single SELECT_VL
4419 : : or MIN since beginning to calculate the number of elements
4420 : : need to be processed in current iteration. */
4421 : : length_update_cost = 1;
4422 : : else
4423 : : /* For increment IV stype, Each may need two MINs and one MINUS to
4424 : : update lengths in body for next iteration. */
4425 : 0 : length_update_cost = 3;
4426 : :
4427 : 0 : if (need_iterate_p)
4428 : 0 : body_stmts += length_update_cost * num_vectors;
4429 : : }
4430 : :
4431 : 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4432 : : scalar_stmt, vect_prologue);
4433 : 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4434 : : scalar_stmt, vect_body);
4435 : : }
4436 : :
4437 : : /* FORNOW: The scalar outside cost is incremented in one of the
4438 : : following ways:
4439 : :
4440 : : 1. The vectorizer checks for alignment and aliasing and generates
4441 : : a condition that allows dynamic vectorization. A cost model
4442 : : check is ANDED with the versioning condition. Hence scalar code
4443 : : path now has the added cost of the versioning check.
4444 : :
4445 : : if (cost > th & versioning_check)
4446 : : jmp to vector code
4447 : :
4448 : : Hence run-time scalar is incremented by not-taken branch cost.
4449 : :
4450 : : 2. The vectorizer then checks if a prologue is required. If the
4451 : : cost model check was not done before during versioning, it has to
4452 : : be done before the prologue check.
4453 : :
4454 : : if (cost <= th)
4455 : : prologue = scalar_iters
4456 : : if (prologue == 0)
4457 : : jmp to vector code
4458 : : else
4459 : : execute prologue
4460 : : if (prologue == num_iters)
4461 : : go to exit
4462 : :
4463 : : Hence the run-time scalar cost is incremented by a taken branch,
4464 : : plus a not-taken branch, plus a taken branch cost.
4465 : :
4466 : : 3. The vectorizer then checks if an epilogue is required. If the
4467 : : cost model check was not done before during prologue check, it
4468 : : has to be done with the epilogue check.
4469 : :
4470 : : if (prologue == 0)
4471 : : jmp to vector code
4472 : : else
4473 : : execute prologue
4474 : : if (prologue == num_iters)
4475 : : go to exit
4476 : : vector code:
4477 : : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4478 : : jmp to epilogue
4479 : :
4480 : : Hence the run-time scalar cost should be incremented by 2 taken
4481 : : branches.
4482 : :
4483 : : TODO: The back end may reorder the BBS's differently and reverse
4484 : : conditions/branch directions. Change the estimates below to
4485 : : something more reasonable. */
4486 : :
4487 : : /* If the number of iterations is known and we do not do versioning, we can
4488 : : decide whether to vectorize at compile time. Hence the scalar version
4489 : : do not carry cost model guard costs. */
4490 : 32455 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4491 : 112338 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4492 : : {
4493 : : /* Cost model check occurs at versioning. */
4494 : 48038 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4495 : 4774 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4496 : : else
4497 : : {
4498 : : /* Cost model check occurs at prologue generation. */
4499 : 43264 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4500 : 26 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4501 : 26 : + vect_get_stmt_cost (cond_branch_not_taken);
4502 : : /* Cost model check occurs at epilogue generation. */
4503 : : else
4504 : 43238 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4505 : : }
4506 : : }
4507 : :
4508 : : /* Complete the target-specific cost calculations. */
4509 : 79883 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
4510 : 79883 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
4511 : 79883 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
4512 : 79883 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
4513 : 79883 : if (suggested_unroll_factor)
4514 : 79704 : *suggested_unroll_factor
4515 : 79704 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
4516 : :
4517 : 79704 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
4518 : 232 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4519 : 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4520 : : *suggested_unroll_factor,
4521 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4522 : : {
4523 : 0 : if (dump_enabled_p ())
4524 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4525 : : "can't unroll as unrolled vectorization factor larger"
4526 : : " than maximum vectorization factor: "
4527 : : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4528 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4529 : 0 : *suggested_unroll_factor = 1;
4530 : : }
4531 : :
4532 : 79883 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4533 : :
4534 : 79883 : if (dump_enabled_p ())
4535 : : {
4536 : 614 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4537 : 614 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4538 : : vec_inside_cost);
4539 : 614 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4540 : : vec_prologue_cost);
4541 : 614 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4542 : : vec_epilogue_cost);
4543 : 614 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4544 : : scalar_single_iter_cost);
4545 : 614 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4546 : : scalar_outside_cost);
4547 : 614 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4548 : : vec_outside_cost);
4549 : 614 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4550 : : peel_iters_prologue);
4551 : 614 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4552 : : peel_iters_epilogue);
4553 : : }
4554 : :
4555 : : /* Calculate number of iterations required to make the vector version
4556 : : profitable, relative to the loop bodies only. The following condition
4557 : : must hold true:
4558 : : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4559 : : where
4560 : : SIC = scalar iteration cost, VIC = vector iteration cost,
4561 : : VOC = vector outside cost, VF = vectorization factor,
4562 : : NPEEL = prologue iterations + epilogue iterations,
4563 : : SOC = scalar outside cost for run time cost model check. */
4564 : :
4565 : 79883 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4566 : 79883 : - vec_inside_cost);
4567 : 79883 : if (saving_per_viter <= 0)
4568 : : {
4569 : 25325 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4570 : 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4571 : : "vectorization did not happen for a simd loop");
4572 : :
4573 : 25325 : if (dump_enabled_p ())
4574 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4575 : : "cost model: the vector iteration cost = %d "
4576 : : "divided by the scalar iteration cost = %d "
4577 : : "is greater or equal to the vectorization factor = %d"
4578 : : ".\n",
4579 : : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4580 : 25325 : *ret_min_profitable_niters = -1;
4581 : 25325 : *ret_min_profitable_estimate = -1;
4582 : 25325 : return;
4583 : : }
4584 : :
4585 : : /* ??? The "if" arm is written to handle all cases; see below for what
4586 : : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4587 : 54558 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4588 : : {
4589 : : /* Rewriting the condition above in terms of the number of
4590 : : vector iterations (vniters) rather than the number of
4591 : : scalar iterations (niters) gives:
4592 : :
4593 : : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4594 : :
4595 : : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4596 : :
4597 : : For integer N, X and Y when X > 0:
4598 : :
4599 : : N * X > Y <==> N >= (Y /[floor] X) + 1. */
4600 : 11 : int outside_overhead = (vec_outside_cost
4601 : 11 : - scalar_single_iter_cost * peel_iters_prologue
4602 : 11 : - scalar_single_iter_cost * peel_iters_epilogue
4603 : : - scalar_outside_cost);
4604 : : /* We're only interested in cases that require at least one
4605 : : vector iteration. */
4606 : 11 : int min_vec_niters = 1;
4607 : 11 : if (outside_overhead > 0)
4608 : 8 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4609 : :
4610 : 11 : if (dump_enabled_p ())
4611 : 3 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4612 : : min_vec_niters);
4613 : :
4614 : 11 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4615 : : {
4616 : : /* Now that we know the minimum number of vector iterations,
4617 : : find the minimum niters for which the scalar cost is larger:
4618 : :
4619 : : SIC * niters > VIC * vniters + VOC - SOC
4620 : :
4621 : : We know that the minimum niters is no more than
4622 : : vniters * VF + NPEEL, but it might be (and often is) less
4623 : : than that if a partial vector iteration is cheaper than the
4624 : : equivalent scalar code. */
4625 : 11 : int threshold = (vec_inside_cost * min_vec_niters
4626 : 11 : + vec_outside_cost
4627 : 11 : - scalar_outside_cost);
4628 : 11 : if (threshold <= 0)
4629 : : min_profitable_iters = 1;
4630 : : else
4631 : 11 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4632 : : }
4633 : : else
4634 : : /* Convert the number of vector iterations into a number of
4635 : : scalar iterations. */
4636 : 0 : min_profitable_iters = (min_vec_niters * assumed_vf
4637 : 0 : + peel_iters_prologue
4638 : : + peel_iters_epilogue);
4639 : : }
4640 : : else
4641 : : {
4642 : 54547 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4643 : 54547 : * assumed_vf
4644 : 54547 : - vec_inside_cost * peel_iters_prologue
4645 : 54547 : - vec_inside_cost * peel_iters_epilogue);
4646 : 54547 : if (min_profitable_iters <= 0)
4647 : : min_profitable_iters = 0;
4648 : : else
4649 : : {
4650 : 45638 : min_profitable_iters /= saving_per_viter;
4651 : :
4652 : 45638 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4653 : 45638 : <= (((int) vec_inside_cost * min_profitable_iters)
4654 : 45638 : + (((int) vec_outside_cost - scalar_outside_cost)
4655 : : * assumed_vf)))
4656 : 45638 : min_profitable_iters++;
4657 : : }
4658 : : }
4659 : :
4660 : 54558 : if (dump_enabled_p ())
4661 : 592 : dump_printf (MSG_NOTE,
4662 : : " Calculated minimum iters for profitability: %d\n",
4663 : : min_profitable_iters);
4664 : :
4665 : 54558 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4666 : 54547 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4667 : : /* We want the vectorized loop to execute at least once. */
4668 : : min_profitable_iters = assumed_vf + peel_iters_prologue;
4669 : 9819 : else if (min_profitable_iters < peel_iters_prologue)
4670 : : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4671 : : vectorized loop executes at least once. */
4672 : : min_profitable_iters = peel_iters_prologue;
4673 : :
4674 : 54558 : if (dump_enabled_p ())
4675 : 592 : dump_printf_loc (MSG_NOTE, vect_location,
4676 : : " Runtime profitability threshold = %d\n",
4677 : : min_profitable_iters);
4678 : :
4679 : 54558 : *ret_min_profitable_niters = min_profitable_iters;
4680 : :
4681 : : /* Calculate number of iterations required to make the vector version
4682 : : profitable, relative to the loop bodies only.
4683 : :
4684 : : Non-vectorized variant is SIC * niters and it must win over vector
4685 : : variant on the expected loop trip count. The following condition must hold true:
4686 : : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4687 : :
4688 : 54558 : if (vec_outside_cost <= 0)
4689 : : min_profitable_estimate = 0;
4690 : : /* ??? This "else if" arm is written to handle all cases; see below for
4691 : : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4692 : 49241 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4693 : : {
4694 : : /* This is a repeat of the code above, but with + SOC rather
4695 : : than - SOC. */
4696 : 11 : int outside_overhead = (vec_outside_cost
4697 : 11 : - scalar_single_iter_cost * peel_iters_prologue
4698 : 11 : - scalar_single_iter_cost * peel_iters_epilogue
4699 : : + scalar_outside_cost);
4700 : 11 : int min_vec_niters = 1;
4701 : 11 : if (outside_overhead > 0)
4702 : 11 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4703 : :
4704 : 11 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4705 : : {
4706 : 11 : int threshold = (vec_inside_cost * min_vec_niters
4707 : 11 : + vec_outside_cost
4708 : 11 : + scalar_outside_cost);
4709 : 11 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4710 : : }
4711 : : else
4712 : : min_profitable_estimate = (min_vec_niters * assumed_vf
4713 : : + peel_iters_prologue
4714 : : + peel_iters_epilogue);
4715 : : }
4716 : : else
4717 : : {
4718 : 49230 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4719 : 49230 : * assumed_vf
4720 : 49230 : - vec_inside_cost * peel_iters_prologue
4721 : 49230 : - vec_inside_cost * peel_iters_epilogue)
4722 : 49230 : / ((scalar_single_iter_cost * assumed_vf)
4723 : : - vec_inside_cost);
4724 : : }
4725 : 54558 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4726 : 54558 : if (dump_enabled_p ())
4727 : 592 : dump_printf_loc (MSG_NOTE, vect_location,
4728 : : " Static estimate profitability threshold = %d\n",
4729 : : min_profitable_estimate);
4730 : :
4731 : 54558 : *ret_min_profitable_estimate = min_profitable_estimate;
4732 : : }
4733 : :
4734 : : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4735 : : vector elements (not bits) for a vector with NELT elements. */
4736 : : static void
4737 : 1957 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4738 : : vec_perm_builder *sel)
4739 : : {
4740 : : /* The encoding is a single stepped pattern. Any wrap-around is handled
4741 : : by vec_perm_indices. */
4742 : 1957 : sel->new_vector (nelt, 1, 3);
4743 : 7828 : for (unsigned int i = 0; i < 3; i++)
4744 : 5871 : sel->quick_push (i + offset);
4745 : 1957 : }
4746 : :
4747 : : /* Checks whether the target supports whole-vector shifts for vectors of mode
4748 : : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4749 : : it supports vec_perm_const with masks for all necessary shift amounts. */
4750 : : static bool
4751 : 6911 : have_whole_vector_shift (machine_mode mode)
4752 : : {
4753 : 6911 : if (can_implement_p (vec_shr_optab, mode))
4754 : : return true;
4755 : :
4756 : : /* Variable-length vectors should be handled via the optab. */
4757 : 55 : unsigned int nelt;
4758 : 110 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4759 : : return false;
4760 : :
4761 : 55 : vec_perm_builder sel;
4762 : 55 : vec_perm_indices indices;
4763 : 285 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4764 : : {
4765 : 230 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
4766 : 230 : indices.new_vector (sel, 2, nelt);
4767 : 230 : if (!can_vec_perm_const_p (mode, mode, indices, false))
4768 : : return false;
4769 : : }
4770 : : return true;
4771 : 55 : }
4772 : :
4773 : : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4774 : : multiplication operands have differing signs and (b) we intend
4775 : : to emulate the operation using a series of signed DOT_PROD_EXPRs.
4776 : : See vect_emulate_mixed_dot_prod for the actual sequence used. */
4777 : :
4778 : : static bool
4779 : 1926 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
4780 : : {
4781 : 1926 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
4782 : 1926 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4783 : 1658 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4784 : : return false;
4785 : :
4786 : 554 : tree rhs1 = gimple_assign_rhs1 (assign);
4787 : 554 : tree rhs2 = gimple_assign_rhs2 (assign);
4788 : 554 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4789 : : return false;
4790 : :
4791 : 399 : return !directly_supported_p (DOT_PROD_EXPR,
4792 : : SLP_TREE_VECTYPE (slp_node),
4793 : 133 : SLP_TREE_VECTYPE
4794 : : (SLP_TREE_CHILDREN (slp_node)[0]),
4795 : 133 : optab_vector_mixed_sign);
4796 : : }
4797 : :
4798 : : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4799 : : functions. Design better to avoid maintenance issues. */
4800 : :
4801 : : /* Function vect_model_reduction_cost.
4802 : :
4803 : : Models cost for a reduction operation, including the vector ops
4804 : : generated within the strip-mine loop in some cases, the initial
4805 : : definition before the loop, and the epilogue code that must be generated. */
4806 : :
4807 : : static void
4808 : 42673 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
4809 : : slp_tree node, internal_fn reduc_fn,
4810 : : vect_reduction_type reduction_type,
4811 : : int ncopies, stmt_vector_for_cost *cost_vec)
4812 : : {
4813 : 42673 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4814 : 42673 : tree vectype;
4815 : 42673 : machine_mode mode;
4816 : 42673 : class loop *loop = NULL;
4817 : :
4818 : 42673 : if (loop_vinfo)
4819 : 42673 : loop = LOOP_VINFO_LOOP (loop_vinfo);
4820 : :
4821 : : /* Condition reductions generate two reductions in the loop. */
4822 : 42673 : if (reduction_type == COND_REDUCTION)
4823 : 193 : ncopies *= 2;
4824 : :
4825 : 42673 : vectype = SLP_TREE_VECTYPE (node);
4826 : 42673 : mode = TYPE_MODE (vectype);
4827 : 42673 : stmt_vec_info orig_stmt_info
4828 : 42673 : = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
4829 : :
4830 : 42673 : gimple_match_op op;
4831 : 42673 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4832 : 0 : gcc_unreachable ();
4833 : :
4834 : 42673 : if (reduction_type == EXTRACT_LAST_REDUCTION)
4835 : : /* No extra instructions are needed in the prologue. The loop body
4836 : : operations are costed in vectorizable_condition. */
4837 : : inside_cost = 0;
4838 : 42673 : else if (reduction_type == FOLD_LEFT_REDUCTION)
4839 : : {
4840 : : /* No extra instructions needed in the prologue. */
4841 : 4165 : prologue_cost = 0;
4842 : :
4843 : 4165 : if (reduc_fn != IFN_LAST)
4844 : : /* Count one reduction-like operation per vector. */
4845 : 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4846 : : node, 0, vect_body);
4847 : : else
4848 : : {
4849 : : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4850 : 4165 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4851 : 4165 : inside_cost = record_stmt_cost (cost_vec, nelements,
4852 : : vec_to_scalar, node, 0,
4853 : : vect_body);
4854 : 4165 : inside_cost += record_stmt_cost (cost_vec, nelements,
4855 : : scalar_stmt, node, 0,
4856 : : vect_body);
4857 : : }
4858 : : }
4859 : : else
4860 : : {
4861 : : /* Add in the cost of the initial definitions. */
4862 : 38508 : int prologue_stmts;
4863 : 38508 : if (reduction_type == COND_REDUCTION)
4864 : : /* For cond reductions we have four vectors: initial index, step,
4865 : : initial result of the data reduction, initial value of the index
4866 : : reduction. */
4867 : : prologue_stmts = 4;
4868 : : else
4869 : : /* We need the initial reduction value. */
4870 : 38315 : prologue_stmts = 1;
4871 : 38508 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4872 : : scalar_to_vec, node, 0,
4873 : : vect_prologue);
4874 : : }
4875 : :
4876 : : /* Determine cost of epilogue code.
4877 : :
4878 : : We have a reduction operator that will reduce the vector in one statement.
4879 : : Also requires scalar extract. */
4880 : :
4881 : 42673 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4882 : : {
4883 : 42546 : if (reduc_fn != IFN_LAST)
4884 : : {
4885 : 31484 : if (reduction_type == COND_REDUCTION)
4886 : : {
4887 : : /* An EQ stmt and an COND_EXPR stmt. */
4888 : 7 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4889 : : vector_stmt, node, 0,
4890 : : vect_epilogue);
4891 : : /* Reduction of the max index and a reduction of the found
4892 : : values. */
4893 : 7 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4894 : : vec_to_scalar, node, 0,
4895 : : vect_epilogue);
4896 : : /* A broadcast of the max value. */
4897 : 7 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4898 : : scalar_to_vec, node, 0,
4899 : : vect_epilogue);
4900 : : }
4901 : : else
4902 : : {
4903 : 31477 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4904 : : node, 0, vect_epilogue);
4905 : 31477 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4906 : : vec_to_scalar, node, 0,
4907 : : vect_epilogue);
4908 : : }
4909 : : }
4910 : 11062 : else if (reduction_type == COND_REDUCTION)
4911 : : {
4912 : 186 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4913 : : /* Extraction of scalar elements. */
4914 : 372 : epilogue_cost += record_stmt_cost (cost_vec,
4915 : 186 : 2 * estimated_nunits,
4916 : : vec_to_scalar, node, 0,
4917 : : vect_epilogue);
4918 : : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4919 : 186 : epilogue_cost += record_stmt_cost (cost_vec,
4920 : 186 : 2 * estimated_nunits - 3,
4921 : : scalar_stmt, node, 0,
4922 : : vect_epilogue);
4923 : : }
4924 : 10876 : else if (reduction_type == EXTRACT_LAST_REDUCTION
4925 : 10876 : || reduction_type == FOLD_LEFT_REDUCTION)
4926 : : /* No extra instructions need in the epilogue. */
4927 : : ;
4928 : : else
4929 : : {
4930 : 6711 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4931 : 6711 : tree bitsize = TYPE_SIZE (op.type);
4932 : 6711 : int element_bitsize = tree_to_uhwi (bitsize);
4933 : 6711 : int nelements = vec_size_in_bits / element_bitsize;
4934 : :
4935 : 6711 : if (op.code == COND_EXPR)
4936 : 28 : op.code = MAX_EXPR;
4937 : :
4938 : : /* We have a whole vector shift available. */
4939 : 793 : if (VECTOR_MODE_P (mode)
4940 : 6711 : && directly_supported_p (op.code, vectype)
4941 : 11970 : && have_whole_vector_shift (mode))
4942 : : {
4943 : : /* Final reduction via vector shifts and the reduction operator.
4944 : : Also requires scalar extract. */
4945 : 15777 : epilogue_cost += record_stmt_cost (cost_vec,
4946 : 10518 : exact_log2 (nelements) * 2,
4947 : : vector_stmt, node, 0,
4948 : : vect_epilogue);
4949 : 5259 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4950 : : vec_to_scalar, node, 0,
4951 : : vect_epilogue);
4952 : : }
4953 : : else
4954 : : /* Use extracts and reduction op for final reduction. For N
4955 : : elements, we have N extracts and N-1 reduction ops. */
4956 : 1452 : epilogue_cost += record_stmt_cost (cost_vec,
4957 : 1452 : nelements + nelements - 1,
4958 : : vector_stmt, node, 0,
4959 : : vect_epilogue);
4960 : : }
4961 : : }
4962 : :
4963 : 42673 : if (dump_enabled_p ())
4964 : 2512 : dump_printf (MSG_NOTE,
4965 : : "vect_model_reduction_cost: inside_cost = %d, "
4966 : : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4967 : : prologue_cost, epilogue_cost);
4968 : 42673 : }
4969 : :
4970 : : /* SEQ is a sequence of instructions that initialize the reduction
4971 : : described by REDUC_INFO. Emit them in the appropriate place. */
4972 : :
4973 : : static void
4974 : 417 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4975 : : vect_reduc_info reduc_info, gimple *seq)
4976 : : {
4977 : 417 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
4978 : : {
4979 : : /* When reusing an accumulator from the main loop, we only need
4980 : : initialization instructions if the main loop can be skipped.
4981 : : In that case, emit the initialization instructions at the end
4982 : : of the guard block that does the skip. */
4983 : 23 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
4984 : 23 : gcc_assert (skip_edge);
4985 : 23 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4986 : 23 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4987 : : }
4988 : : else
4989 : : {
4990 : : /* The normal case: emit the initialization instructions on the
4991 : : preheader edge. */
4992 : 394 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4993 : 394 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4994 : : }
4995 : 417 : }
4996 : :
4997 : : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4998 : : which performs a reduction involving GROUP_SIZE scalar statements.
4999 : : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5000 : : is nonnull, introducing extra elements of that value will not change the
5001 : : result. */
5002 : :
5003 : : static void
5004 : 20870 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5005 : : vect_reduc_info reduc_info,
5006 : : tree vector_type,
5007 : : vec<tree> *vec_oprnds,
5008 : : unsigned int number_of_vectors,
5009 : : unsigned int group_size, tree neutral_op)
5010 : : {
5011 : 20870 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
5012 : 20870 : unsigned HOST_WIDE_INT nunits;
5013 : 20870 : unsigned j, number_of_places_left_in_vector;
5014 : 20870 : unsigned int i;
5015 : :
5016 : 41740 : gcc_assert (group_size == initial_values.length () || neutral_op);
5017 : :
5018 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5019 : : created vectors. It is greater than 1 if unrolling is performed.
5020 : :
5021 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
5022 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
5023 : : of this type can be packed in a vector). The output vector will contain
5024 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5025 : : will be 2).
5026 : :
5027 : : If GROUP_SIZE > NUNITS, the scalars will be split into several
5028 : : vectors containing the operands.
5029 : :
5030 : : For example, NUNITS is four as before, and the group size is 8
5031 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5032 : : {s5, s6, s7, s8}. */
5033 : :
5034 : 20870 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5035 : : nunits = group_size;
5036 : :
5037 : 20870 : number_of_places_left_in_vector = nunits;
5038 : 20870 : bool constant_p = true;
5039 : 20870 : tree_vector_builder elts (vector_type, nunits, 1);
5040 : 20870 : elts.quick_grow (nunits);
5041 : 20870 : gimple_seq ctor_seq = NULL;
5042 : 20870 : if (neutral_op
5043 : 41653 : && !useless_type_conversion_p (TREE_TYPE (vector_type),
5044 : 20783 : TREE_TYPE (neutral_op)))
5045 : 1 : neutral_op = gimple_convert (&ctor_seq,
5046 : 1 : TREE_TYPE (vector_type),
5047 : : neutral_op);
5048 : 210872 : for (j = 0; j < nunits * number_of_vectors; ++j)
5049 : : {
5050 : 190002 : tree op;
5051 : 190002 : i = j % group_size;
5052 : :
5053 : : /* Get the def before the loop. In reduction chain we have only
5054 : : one initial value. Else we have as many as PHIs in the group. */
5055 : 190002 : if (i >= initial_values.length () || (j > i && neutral_op))
5056 : : op = neutral_op;
5057 : : else
5058 : : {
5059 : 43090 : if (!useless_type_conversion_p (TREE_TYPE (vector_type),
5060 : 21545 : TREE_TYPE (initial_values[i])))
5061 : 18 : initial_values[i] = gimple_convert (&ctor_seq,
5062 : 9 : TREE_TYPE (vector_type),
5063 : 9 : initial_values[i]);
5064 : 21545 : op = initial_values[i];
5065 : : }
5066 : :
5067 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
5068 : 190002 : number_of_places_left_in_vector--;
5069 : 190002 : elts[nunits - number_of_places_left_in_vector - 1] = op;
5070 : 190002 : if (!CONSTANT_CLASS_P (op))
5071 : 2265 : constant_p = false;
5072 : :
5073 : 190002 : if (number_of_places_left_in_vector == 0)
5074 : : {
5075 : 24390 : tree init;
5076 : 48780 : if (constant_p && !neutral_op
5077 : 48729 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5078 : 24390 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5079 : : /* Build the vector directly from ELTS. */
5080 : 24390 : init = gimple_build_vector (&ctor_seq, &elts);
5081 : 0 : else if (neutral_op)
5082 : : {
5083 : : /* Build a vector of the neutral value and shift the
5084 : : other elements into place. */
5085 : 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5086 : : neutral_op);
5087 : 0 : int k = nunits;
5088 : 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5089 : : k -= 1;
5090 : 0 : while (k > 0)
5091 : : {
5092 : 0 : k -= 1;
5093 : 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5094 : 0 : vector_type, init, elts[k]);
5095 : : }
5096 : : }
5097 : : else
5098 : : {
5099 : : /* First time round, duplicate ELTS to fill the
5100 : : required number of vectors. */
5101 : 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5102 : : elts, number_of_vectors, *vec_oprnds);
5103 : 0 : break;
5104 : : }
5105 : 24390 : vec_oprnds->quick_push (init);
5106 : :
5107 : 24390 : number_of_places_left_in_vector = nunits;
5108 : 24390 : elts.new_vector (vector_type, nunits, 1);
5109 : 24390 : elts.quick_grow (nunits);
5110 : 24390 : constant_p = true;
5111 : : }
5112 : : }
5113 : 20870 : if (ctor_seq != NULL)
5114 : 417 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5115 : 20870 : }
5116 : :
5117 : : vect_reduc_info
5118 : 125708 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
5119 : : {
5120 : 125708 : if (node->cycle_info.id == -1)
5121 : : return NULL;
5122 : 123964 : return loop_vinfo->reduc_infos[node->cycle_info.id];
5123 : : }
5124 : :
5125 : : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5126 : : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5127 : : return false. */
5128 : :
5129 : : static bool
5130 : 18488 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5131 : : vect_reduc_info reduc_info, tree vectype)
5132 : : {
5133 : 18488 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5134 : 18488 : if (!main_loop_vinfo)
5135 : : return false;
5136 : :
5137 : 3879 : if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5138 : : return false;
5139 : :
5140 : 3861 : unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
5141 : 3861 : auto_vec<tree, 16> main_loop_results (num_phis);
5142 : 3861 : auto_vec<tree, 16> initial_values (num_phis);
5143 : 3861 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5144 : : {
5145 : : /* The epilogue loop can be entered either from the main loop or
5146 : : from an earlier guard block. */
5147 : 3674 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5148 : 14716 : for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
5149 : : {
5150 : : /* Look for:
5151 : :
5152 : : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5153 : : INITIAL_VALUE(guard block)>. */
5154 : 3694 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5155 : :
5156 : 3694 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5157 : 3694 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5158 : :
5159 : 3694 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5160 : 3694 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5161 : :
5162 : 3694 : main_loop_results.quick_push (from_main_loop);
5163 : 3694 : initial_values.quick_push (from_skip);
5164 : : }
5165 : : }
5166 : : else
5167 : : /* The main loop dominates the epilogue loop. */
5168 : 187 : main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));
5169 : :
5170 : : /* See if the main loop has the kind of accumulator we need. */
5171 : 3861 : vect_reusable_accumulator *accumulator
5172 : 3861 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5173 : 3861 : if (!accumulator
5174 : 7708 : || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
5175 : 11569 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5176 : : VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
5177 : : return false;
5178 : :
5179 : : /* Handle the case where we can reduce wider vectors to narrower ones. */
5180 : 3850 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5181 : 3850 : unsigned HOST_WIDE_INT m;
5182 : 3850 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5183 : 3850 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5184 : 0 : return false;
5185 : : /* Check the intermediate vector types and operations are available. */
5186 : 3850 : tree prev_vectype = old_vectype;
5187 : 3850 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5188 : 11030 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5189 : : {
5190 : 3850 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5191 : 3850 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5192 : 3850 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5193 : 3850 : if (!intermediate_vectype
5194 : 3850 : || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
5195 : : intermediate_vectype)
5196 : 7180 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5197 : 3330 : TYPE_MODE (intermediate_vectype)))
5198 : : return false;
5199 : : prev_vectype = intermediate_vectype;
5200 : : }
5201 : :
5202 : : /* Non-SLP reductions might apply an adjustment after the reduction
5203 : : operation, in order to simplify the initialization of the accumulator.
5204 : : If the epilogue loop carries on from where the main loop left off,
5205 : : it should apply the same adjustment to the final reduction result.
5206 : :
5207 : : If the epilogue loop can also be entered directly (rather than via
5208 : : the main loop), we need to be able to handle that case in the same way,
5209 : : with the same adjustment. (In principle we could add a PHI node
5210 : : to select the correct adjustment, but in practice that shouldn't be
5211 : : necessary.) */
5212 : 3330 : tree main_adjustment
5213 : 3330 : = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5214 : 3330 : if (loop_vinfo->main_loop_edge && main_adjustment)
5215 : : {
5216 : 2824 : gcc_assert (num_phis == 1);
5217 : 2824 : tree initial_value = initial_values[0];
5218 : : /* Check that we can use INITIAL_VALUE as the adjustment and
5219 : : initialize the accumulator with a neutral value instead. */
5220 : 2824 : if (!operand_equal_p (initial_value, main_adjustment))
5221 : 106 : return false;
5222 : 2718 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5223 : 2718 : initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5224 : : code, initial_value);
5225 : : }
5226 : 3224 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5227 : 3224 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
5228 : 3224 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
5229 : 3224 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
5230 : 3224 : return true;
5231 : 3861 : }
5232 : :
5233 : : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5234 : : CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5235 : :
5236 : : static tree
5237 : 4874 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5238 : : gimple_seq *seq)
5239 : : {
5240 : 4874 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5241 : 4874 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5242 : 4874 : tree stype = TREE_TYPE (vectype);
5243 : 4874 : tree new_temp = vec_def;
5244 : 8137 : while (nunits > nunits1)
5245 : : {
5246 : 3263 : nunits /= 2;
5247 : 3263 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5248 : 3263 : stype, nunits);
5249 : 3263 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5250 : :
5251 : : /* The target has to make sure we support lowpart/highpart
5252 : : extraction, either via direct vector extract or through
5253 : : an integer mode punning. */
5254 : 3263 : tree dst1, dst2;
5255 : 3263 : gimple *epilog_stmt;
5256 : 3263 : if (convert_optab_handler (vec_extract_optab,
5257 : 3263 : TYPE_MODE (TREE_TYPE (new_temp)),
5258 : 3263 : TYPE_MODE (vectype1))
5259 : : != CODE_FOR_nothing)
5260 : : {
5261 : : /* Extract sub-vectors directly once vec_extract becomes
5262 : : a conversion optab. */
5263 : 2022 : dst1 = make_ssa_name (vectype1);
5264 : 2022 : epilog_stmt
5265 : 4044 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5266 : : build3 (BIT_FIELD_REF, vectype1,
5267 : 2022 : new_temp, TYPE_SIZE (vectype1),
5268 : : bitsize_int (0)));
5269 : 2022 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5270 : 2022 : dst2 = make_ssa_name (vectype1);
5271 : 2022 : epilog_stmt
5272 : 2022 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5273 : : build3 (BIT_FIELD_REF, vectype1,
5274 : 2022 : new_temp, TYPE_SIZE (vectype1),
5275 : 2022 : bitsize_int (bitsize)));
5276 : 2022 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5277 : : }
5278 : : else
5279 : : {
5280 : : /* Extract via punning to appropriately sized integer mode
5281 : : vector. */
5282 : 1241 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5283 : 1241 : tree etype = build_vector_type (eltype, 2);
5284 : 2482 : gcc_assert (convert_optab_handler (vec_extract_optab,
5285 : : TYPE_MODE (etype),
5286 : : TYPE_MODE (eltype))
5287 : : != CODE_FOR_nothing);
5288 : 1241 : tree tem = make_ssa_name (etype);
5289 : 1241 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5290 : : build1 (VIEW_CONVERT_EXPR,
5291 : : etype, new_temp));
5292 : 1241 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5293 : 1241 : new_temp = tem;
5294 : 1241 : tem = make_ssa_name (eltype);
5295 : 1241 : epilog_stmt
5296 : 2482 : = gimple_build_assign (tem, BIT_FIELD_REF,
5297 : : build3 (BIT_FIELD_REF, eltype,
5298 : 1241 : new_temp, TYPE_SIZE (eltype),
5299 : : bitsize_int (0)));
5300 : 1241 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5301 : 1241 : dst1 = make_ssa_name (vectype1);
5302 : 1241 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5303 : : build1 (VIEW_CONVERT_EXPR,
5304 : : vectype1, tem));
5305 : 1241 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5306 : 1241 : tem = make_ssa_name (eltype);
5307 : 1241 : epilog_stmt
5308 : 1241 : = gimple_build_assign (tem, BIT_FIELD_REF,
5309 : : build3 (BIT_FIELD_REF, eltype,
5310 : 1241 : new_temp, TYPE_SIZE (eltype),
5311 : 1241 : bitsize_int (bitsize)));
5312 : 1241 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5313 : 1241 : dst2 = make_ssa_name (vectype1);
5314 : 1241 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5315 : : build1 (VIEW_CONVERT_EXPR,
5316 : : vectype1, tem));
5317 : 1241 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5318 : : }
5319 : :
5320 : 3263 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5321 : : }
5322 : :
5323 : 4874 : return new_temp;
5324 : : }
5325 : :
5326 : : /* Function vect_create_epilog_for_reduction
5327 : :
5328 : : Create code at the loop-epilog to finalize the result of a reduction
5329 : : computation.
5330 : :
5331 : : STMT_INFO is the scalar reduction stmt that is being vectorized.
5332 : : SLP_NODE is an SLP node containing a group of reduction statements. The
5333 : : first one in this group is STMT_INFO.
5334 : : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5335 : : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5336 : : (counting from 0)
5337 : : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5338 : : exit this edge is always the main loop exit.
5339 : :
5340 : : This function:
5341 : : 1. Completes the reduction def-use cycles.
5342 : : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5343 : : by calling the function specified by REDUC_FN if available, or by
5344 : : other means (whole-vector shifts or a scalar loop).
5345 : : The function also creates a new phi node at the loop exit to preserve
5346 : : loop-closed form, as illustrated below.
5347 : :
5348 : : The flow at the entry to this function:
5349 : :
5350 : : loop:
5351 : : vec_def = phi <vec_init, null> # REDUCTION_PHI
5352 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5353 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5354 : : loop_exit:
5355 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5356 : : use <s_out0>
5357 : : use <s_out0>
5358 : :
5359 : : The above is transformed by this function into:
5360 : :
5361 : : loop:
5362 : : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5363 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5364 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5365 : : loop_exit:
5366 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5367 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5368 : : v_out2 = reduce <v_out1>
5369 : : s_out3 = extract_field <v_out2, 0>
5370 : : s_out4 = adjust_result <s_out3>
5371 : : use <s_out4>
5372 : : use <s_out4>
5373 : : */
5374 : :
5375 : : static void
5376 : 21191 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5377 : : stmt_vec_info stmt_info,
5378 : : slp_tree slp_node,
5379 : : slp_instance slp_node_instance,
5380 : : edge loop_exit)
5381 : : {
5382 : 21191 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
5383 : 21191 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5384 : 21191 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
5385 : 21191 : tree vectype;
5386 : 21191 : machine_mode mode;
5387 : 21191 : basic_block exit_bb;
5388 : 21191 : gimple *new_phi = NULL, *phi = NULL;
5389 : 21191 : gimple_stmt_iterator exit_gsi;
5390 : 21191 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5391 : 21191 : gimple *epilog_stmt = NULL;
5392 : 21191 : gimple *exit_phi;
5393 : 21191 : tree bitsize;
5394 : 21191 : tree def;
5395 : 21191 : tree orig_name, scalar_result;
5396 : 21191 : imm_use_iterator imm_iter;
5397 : 21191 : use_operand_p use_p;
5398 : 21191 : gimple *use_stmt;
5399 : 21191 : auto_vec<tree> reduc_inputs;
5400 : 21191 : int j, i;
5401 : 21191 : vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
5402 : 21191 : unsigned int k;
5403 : : /* SLP reduction without reduction chain, e.g.,
5404 : : # a1 = phi <a2, a0>
5405 : : # b1 = phi <b2, b0>
5406 : : a2 = operation (a1)
5407 : : b2 = operation (b1) */
5408 : 21191 : const bool slp_reduc
5409 : 21191 : = SLP_INSTANCE_KIND (slp_node_instance) != slp_inst_kind_reduc_chain;
5410 : 21191 : tree induction_index = NULL_TREE;
5411 : :
5412 : 21191 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5413 : :
5414 : 21191 : bool double_reduc = false;
5415 : 21191 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5416 : 21191 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5417 : : {
5418 : 70 : double_reduc = true;
5419 : 70 : gcc_assert (slp_reduc);
5420 : : }
5421 : :
5422 : 21191 : vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
5423 : 21191 : gcc_assert (vectype);
5424 : 21191 : mode = TYPE_MODE (vectype);
5425 : :
5426 : 21191 : tree induc_val = NULL_TREE;
5427 : 21191 : tree adjustment_def = NULL;
5428 : : /* Optimize: for induction condition reduction, if we can't use zero
5429 : : for induc_val, use initial_def. */
5430 : 21191 : if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5431 : 66 : induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
5432 : 21125 : else if (double_reduc)
5433 : : ;
5434 : : else
5435 : 21055 : adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
5436 : :
5437 : 21191 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5438 : 21191 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5439 : 21191 : if (slp_reduc)
5440 : : /* All statements produce live-out values. */
5441 : 37774 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5442 : :
5443 : 21191 : unsigned vec_num
5444 : 21191 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5445 : :
5446 : : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5447 : : which is updated with the current index of the loop for every match of
5448 : : the original loop's cond_expr (VEC_STMT). This results in a vector
5449 : : containing the last time the condition passed for that vector lane.
5450 : : The first match will be a 1 to allow 0 to be used for non-matching
5451 : : indexes. If there are no matches at all then the vector will be all
5452 : : zeroes.
5453 : :
5454 : : PR92772: This algorithm is broken for architectures that support
5455 : : masked vectors, but do not provide fold_extract_last. */
5456 : 21191 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
5457 : : {
5458 : 73 : gcc_assert (!double_reduc);
5459 : 73 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5460 : 73 : slp_tree cond_node = slp_node_instance->root;
5461 : 167 : while (cond_node != slp_node_instance->reduc_phis)
5462 : : {
5463 : 94 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5464 : 94 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5465 : : {
5466 : 82 : gimple *vec_stmt
5467 : 82 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5468 : 82 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5469 : 82 : ccompares.safe_push
5470 : 82 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5471 : 82 : SLP_TREE_REDUC_IDX (cond_node) == 2));
5472 : : }
5473 : 94 : int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
5474 : 94 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5475 : : }
5476 : 73 : gcc_assert (ccompares.length () != 0);
5477 : :
5478 : 73 : tree indx_before_incr, indx_after_incr;
5479 : 73 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5480 : 73 : int scalar_precision
5481 : 73 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5482 : 73 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5483 : 73 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
5484 : 73 : (TYPE_MODE (vectype), cr_index_scalar_type,
5485 : : TYPE_VECTOR_SUBPARTS (vectype));
5486 : :
5487 : : /* First we create a simple vector induction variable which starts
5488 : : with the values {1,2,3,...} (SERIES_VECT) and increments by the
5489 : : vector size (STEP). */
5490 : :
5491 : : /* Create a {1,2,3,...} vector. */
5492 : 73 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5493 : :
5494 : : /* Create a vector of the step value. */
5495 : 73 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5496 : 73 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5497 : :
5498 : : /* Create an induction variable. */
5499 : 73 : gimple_stmt_iterator incr_gsi;
5500 : 73 : bool insert_after;
5501 : 73 : vect_iv_increment_position (LOOP_VINFO_IV_EXIT (loop_vinfo),
5502 : : &incr_gsi, &insert_after);
5503 : 73 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5504 : : insert_after, &indx_before_incr, &indx_after_incr);
5505 : :
5506 : : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5507 : : filled with zeros (VEC_ZERO). */
5508 : :
5509 : : /* Create a vector of 0s. */
5510 : 73 : tree zero = build_zero_cst (cr_index_scalar_type);
5511 : 73 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5512 : :
5513 : : /* Create a vector phi node. */
5514 : 73 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5515 : 73 : new_phi = create_phi_node (new_phi_tree, loop->header);
5516 : 73 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5517 : : loop_preheader_edge (loop), UNKNOWN_LOCATION);
5518 : :
5519 : : /* Now take the condition from the loops original cond_exprs
5520 : : and produce a new cond_exprs (INDEX_COND_EXPR) which for
5521 : : every match uses values from the induction variable
5522 : : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5523 : : (NEW_PHI_TREE).
5524 : : Finally, we update the phi (NEW_PHI_TREE) to take the value of
5525 : : the new cond_expr (INDEX_COND_EXPR). */
5526 : 73 : gimple_seq stmts = NULL;
5527 : 228 : for (int i = ccompares.length () - 1; i != -1; --i)
5528 : : {
5529 : 82 : tree ccompare = ccompares[i].first;
5530 : 82 : if (ccompares[i].second)
5531 : 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5532 : : cr_index_vector_type,
5533 : : ccompare,
5534 : : indx_before_incr, new_phi_tree);
5535 : : else
5536 : 13 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5537 : : cr_index_vector_type,
5538 : : ccompare,
5539 : : new_phi_tree, indx_before_incr);
5540 : : }
5541 : 73 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5542 : :
5543 : : /* Update the phi with the vec cond. */
5544 : 73 : induction_index = new_phi_tree;
5545 : 73 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5546 : : loop_latch_edge (loop), UNKNOWN_LOCATION);
5547 : 73 : }
5548 : :
5549 : : /* 2. Create epilog code.
5550 : : The reduction epilog code operates across the elements of the vector
5551 : : of partial results computed by the vectorized loop.
5552 : : The reduction epilog code consists of:
5553 : :
5554 : : step 1: compute the scalar result in a vector (v_out2)
5555 : : step 2: extract the scalar result (s_out3) from the vector (v_out2)
5556 : : step 3: adjust the scalar result (s_out3) if needed.
5557 : :
5558 : : Step 1 can be accomplished using one the following three schemes:
5559 : : (scheme 1) using reduc_fn, if available.
5560 : : (scheme 2) using whole-vector shifts, if available.
5561 : : (scheme 3) using a scalar loop. In this case steps 1+2 above are
5562 : : combined.
5563 : :
5564 : : The overall epilog code looks like this:
5565 : :
5566 : : s_out0 = phi <s_loop> # original EXIT_PHI
5567 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5568 : : v_out2 = reduce <v_out1> # step 1
5569 : : s_out3 = extract_field <v_out2, 0> # step 2
5570 : : s_out4 = adjust_result <s_out3> # step 3
5571 : :
5572 : : (step 3 is optional, and steps 1 and 2 may be combined).
5573 : : Lastly, the uses of s_out0 are replaced by s_out4. */
5574 : :
5575 : :
5576 : : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5577 : : v_out1 = phi <VECT_DEF>
5578 : : Store them in NEW_PHIS. */
5579 : : /* We need to reduce values in all exits. */
5580 : 21191 : exit_bb = loop_exit->dest;
5581 : 21191 : exit_gsi = gsi_after_labels (exit_bb);
5582 : 21191 : reduc_inputs.create (vec_num);
5583 : 45906 : for (unsigned i = 0; i < vec_num; i++)
5584 : : {
5585 : 24715 : gimple_seq stmts = NULL;
5586 : 24715 : def = vect_get_slp_vect_def (slp_node, i);
5587 : 24715 : tree new_def = copy_ssa_name (def);
5588 : 24715 : phi = create_phi_node (new_def, exit_bb);
5589 : 24715 : if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
5590 : 24688 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
5591 : : else
5592 : : {
5593 : 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
5594 : 30 : SET_PHI_ARG_DEF (phi, k, def);
5595 : : }
5596 : 24715 : new_def = gimple_convert (&stmts, vectype, new_def);
5597 : 24715 : reduc_inputs.quick_push (new_def);
5598 : 24715 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5599 : : }
5600 : :
5601 : : /* 2.2 Get the original scalar reduction variable as defined in the loop.
5602 : : In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
5603 : : pattern), the scalar-def is taken from the original stmt that the
5604 : : pattern-stmt (STMT) replaces. */
5605 : :
5606 : 21806 : tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5607 : 21191 : tree scalar_type = TREE_TYPE (scalar_dest);
5608 : 21191 : scalar_results.truncate (0);
5609 : 21191 : scalar_results.reserve_exact (group_size);
5610 : 21191 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5611 : 21191 : bitsize = TYPE_SIZE (scalar_type);
5612 : :
5613 : : /* True if we should implement SLP_REDUC using native reduction operations
5614 : : instead of scalar operations. */
5615 : 21191 : const bool direct_slp_reduc
5616 : 21191 : = (reduc_fn != IFN_LAST
5617 : 21191 : && slp_reduc
5618 : 21191 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5619 : :
5620 : : /* In case of reduction chain, e.g.,
5621 : : # a1 = phi <a3, a0>
5622 : : a2 = operation (a1)
5623 : : a3 = operation (a2),
5624 : :
5625 : : we may end up with more than one vector result. Here we reduce them
5626 : : to one vector.
5627 : :
5628 : : The same is true for a SLP reduction, e.g.,
5629 : : # a1 = phi <a2, a0>
5630 : : # b1 = phi <b2, b0>
5631 : : a2 = operation (a1)
5632 : : b2 = operation (a2),
5633 : :
5634 : : where we can end up with more than one vector as well. We can
5635 : : easily accumulate vectors when the number of vector elements is
5636 : : a multiple of the SLP group size.
5637 : :
5638 : : The same is true if we couldn't use a single defuse cycle. */
5639 : 21191 : if (!slp_reduc
5640 : : || direct_slp_reduc
5641 : 21191 : || (slp_reduc
5642 : 18887 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
5643 : : {
5644 : 21164 : gimple_seq stmts = NULL;
5645 : 21164 : tree single_input = reduc_inputs[0];
5646 : 24636 : for (k = 1; k < reduc_inputs.length (); k++)
5647 : 6944 : single_input = gimple_build (&stmts, code, vectype,
5648 : 3472 : single_input, reduc_inputs[k]);
5649 : 21164 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5650 : :
5651 : 21164 : reduc_inputs.truncate (0);
5652 : 21164 : reduc_inputs.safe_push (single_input);
5653 : : }
5654 : :
5655 : 21191 : tree orig_reduc_input = reduc_inputs[0];
5656 : :
5657 : : /* If this loop is an epilogue loop that can be skipped after the
5658 : : main loop, we can only share a reduction operation between the
5659 : : main loop and the epilogue if we put it at the target of the
5660 : : skip edge.
5661 : :
5662 : : We can still reuse accumulators if this check fails. Doing so has
5663 : : the minor(?) benefit of making the epilogue loop's scalar result
5664 : : independent of the main loop's scalar result. */
5665 : 21191 : bool unify_with_main_loop_p = false;
5666 : 21191 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
5667 : 3224 : && loop_vinfo->skip_this_loop_edge
5668 : 3024 : && single_succ_p (exit_bb)
5669 : 21208 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5670 : : {
5671 : 17 : unify_with_main_loop_p = true;
5672 : :
5673 : 17 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5674 : 17 : reduc_inputs[0] = make_ssa_name (vectype);
5675 : 17 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5676 : 17 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5677 : : UNKNOWN_LOCATION);
5678 : 17 : add_phi_arg (new_phi,
5679 : 17 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
5680 : : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5681 : 17 : exit_gsi = gsi_after_labels (reduc_block);
5682 : : }
5683 : :
5684 : : /* Shouldn't be used beyond this point. */
5685 : 21191 : exit_bb = nullptr;
5686 : :
5687 : 21191 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5688 : 73 : && reduc_fn != IFN_LAST)
5689 : : {
5690 : : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5691 : : various data values where the condition matched and another vector
5692 : : (INDUCTION_INDEX) containing all the indexes of those matches. We
5693 : : need to extract the last matching index (which will be the index with
5694 : : highest value) and use this to index into the data vector.
5695 : : For the case where there were no matches, the data vector will contain
5696 : : all default values and the index vector will be all zeros. */
5697 : :
5698 : : /* Get various versions of the type of the vector of indexes. */
5699 : 4 : tree index_vec_type = TREE_TYPE (induction_index);
5700 : 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5701 : 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
5702 : 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
5703 : :
5704 : : /* Get an unsigned integer version of the type of the data vector. */
5705 : 4 : int scalar_precision
5706 : 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5707 : 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5708 : 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5709 : : vectype);
5710 : :
5711 : : /* First we need to create a vector (ZERO_VEC) of zeros and another
5712 : : vector (MAX_INDEX_VEC) filled with the last matching index, which we
5713 : : can create using a MAX reduction and then expanding.
5714 : : In the case where the loop never made any matches, the max index will
5715 : : be zero. */
5716 : :
5717 : : /* Vector of {0, 0, 0,...}. */
5718 : 4 : tree zero_vec = build_zero_cst (vectype);
5719 : :
5720 : : /* Find maximum value from the vector of found indexes. */
5721 : 4 : tree max_index = make_ssa_name (index_scalar_type);
5722 : 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5723 : : 1, induction_index);
5724 : 4 : gimple_call_set_lhs (max_index_stmt, max_index);
5725 : 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5726 : :
5727 : : /* Vector of {max_index, max_index, max_index,...}. */
5728 : 4 : tree max_index_vec = make_ssa_name (index_vec_type);
5729 : 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5730 : : max_index);
5731 : 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5732 : : max_index_vec_rhs);
5733 : 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5734 : :
5735 : : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5736 : : with the vector (INDUCTION_INDEX) of found indexes, choosing values
5737 : : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5738 : : otherwise. Only one value should match, resulting in a vector
5739 : : (VEC_COND) with one data value and the rest zeros.
5740 : : In the case where the loop never made any matches, every index will
5741 : : match, resulting in a vector with all data values (which will all be
5742 : : the default value). */
5743 : :
5744 : : /* Compare the max index vector to the vector of found indexes to find
5745 : : the position of the max value. */
5746 : 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
5747 : 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5748 : : induction_index,
5749 : : max_index_vec);
5750 : 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5751 : :
5752 : : /* Use the compare to choose either values from the data vector or
5753 : : zero. */
5754 : 4 : tree vec_cond = make_ssa_name (vectype);
5755 : 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5756 : : vec_compare,
5757 : 4 : reduc_inputs[0],
5758 : : zero_vec);
5759 : 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5760 : :
5761 : : /* Finally we need to extract the data value from the vector (VEC_COND)
5762 : : into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5763 : : reduction, but because this doesn't exist, we can use a MAX reduction
5764 : : instead. The data value might be signed or a float so we need to cast
5765 : : it first.
5766 : : In the case where the loop never made any matches, the data values are
5767 : : all identical, and so will reduce down correctly. */
5768 : :
5769 : : /* Make the matched data values unsigned. */
5770 : 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5771 : 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5772 : : vec_cond);
5773 : 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5774 : : VIEW_CONVERT_EXPR,
5775 : : vec_cond_cast_rhs);
5776 : 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5777 : :
5778 : : /* Reduce down to a scalar value. */
5779 : 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
5780 : 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5781 : : 1, vec_cond_cast);
5782 : 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5783 : 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5784 : :
5785 : : /* Convert the reduced value back to the result type and set as the
5786 : : result. */
5787 : 4 : gimple_seq stmts = NULL;
5788 : 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5789 : : data_reduc);
5790 : 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5791 : 4 : scalar_results.safe_push (new_temp);
5792 : 4 : }
5793 : 21187 : else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5794 : 69 : && reduc_fn == IFN_LAST)
5795 : : {
5796 : : /* Condition reduction without supported IFN_REDUC_MAX. Generate
5797 : : idx = 0;
5798 : : idx_val = induction_index[0];
5799 : : val = data_reduc[0];
5800 : : for (idx = 0, val = init, i = 0; i < nelts; ++i)
5801 : : if (induction_index[i] > idx_val)
5802 : : val = data_reduc[i], idx_val = induction_index[i];
5803 : : return val; */
5804 : :
5805 : 69 : tree data_eltype = TREE_TYPE (vectype);
5806 : 69 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5807 : 69 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5808 : 69 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5809 : : /* Enforced by vectorizable_reduction, which ensures we have target
5810 : : support before allowing a conditional reduction on variable-length
5811 : : vectors. */
5812 : 69 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5813 : 69 : tree idx_val = NULL_TREE, val = NULL_TREE;
5814 : 461 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5815 : : {
5816 : 392 : tree old_idx_val = idx_val;
5817 : 392 : tree old_val = val;
5818 : 392 : idx_val = make_ssa_name (idx_eltype);
5819 : 392 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5820 : : build3 (BIT_FIELD_REF, idx_eltype,
5821 : : induction_index,
5822 : 392 : bitsize_int (el_size),
5823 : 392 : bitsize_int (off)));
5824 : 392 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5825 : 392 : val = make_ssa_name (data_eltype);
5826 : 784 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5827 : : build3 (BIT_FIELD_REF,
5828 : : data_eltype,
5829 : 392 : reduc_inputs[0],
5830 : 392 : bitsize_int (el_size),
5831 : 392 : bitsize_int (off)));
5832 : 392 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5833 : 392 : if (off != 0)
5834 : : {
5835 : 323 : tree new_idx_val = idx_val;
5836 : 323 : if (off != v_size - el_size)
5837 : : {
5838 : 254 : new_idx_val = make_ssa_name (idx_eltype);
5839 : 254 : epilog_stmt = gimple_build_assign (new_idx_val,
5840 : : MAX_EXPR, idx_val,
5841 : : old_idx_val);
5842 : 254 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5843 : : }
5844 : 323 : tree cond = make_ssa_name (boolean_type_node);
5845 : 323 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5846 : : idx_val, old_idx_val);
5847 : 323 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5848 : 323 : tree new_val = make_ssa_name (data_eltype);
5849 : 323 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5850 : : cond, val, old_val);
5851 : 323 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5852 : 323 : idx_val = new_idx_val;
5853 : 323 : val = new_val;
5854 : : }
5855 : : }
5856 : : /* Convert the reduced value back to the result type and set as the
5857 : : result. */
5858 : 69 : gimple_seq stmts = NULL;
5859 : 69 : val = gimple_convert (&stmts, scalar_type, val);
5860 : 69 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5861 : 69 : scalar_results.safe_push (val);
5862 : 69 : }
5863 : :
5864 : : /* 2.3 Create the reduction code, using one of the three schemes described
5865 : : above. In SLP we simply need to extract all the elements from the
5866 : : vector (without reducing them), so we use scalar shifts. */
5867 : 21118 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
5868 : : {
5869 : 19466 : tree tmp;
5870 : 19466 : tree vec_elem_type;
5871 : :
5872 : : /* Case 1: Create:
5873 : : v_out2 = reduc_expr <v_out1> */
5874 : :
5875 : 19466 : if (dump_enabled_p ())
5876 : 1320 : dump_printf_loc (MSG_NOTE, vect_location,
5877 : : "Reduce using direct vector reduction.\n");
5878 : :
5879 : 19466 : gimple_seq stmts = NULL;
5880 : 19466 : vec_elem_type = TREE_TYPE (vectype);
5881 : 19466 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5882 : 19466 : vec_elem_type, reduc_inputs[0]);
5883 : 19466 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5884 : 19466 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5885 : :
5886 : 19466 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5887 : 66 : && induc_val)
5888 : : {
5889 : : /* Earlier we set the initial value to be a vector if induc_val
5890 : : values. Check the result and if it is induc_val then replace
5891 : : with the original initial value, unless induc_val is
5892 : : the same as initial_def already. */
5893 : 63 : tree zcompare = make_ssa_name (boolean_type_node);
5894 : 63 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5895 : : new_temp, induc_val);
5896 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5897 : 63 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
5898 : 63 : tmp = make_ssa_name (new_scalar_dest);
5899 : 63 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5900 : : initial_def, new_temp);
5901 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5902 : 63 : new_temp = tmp;
5903 : : }
5904 : :
5905 : 19466 : scalar_results.safe_push (new_temp);
5906 : 19466 : }
5907 : 1495 : else if (direct_slp_reduc)
5908 : : {
5909 : : /* Here we create one vector for each of the GROUP_SIZE results,
5910 : : with the elements for other SLP statements replaced with the
5911 : : neutral value. We can then do a normal reduction on each vector. */
5912 : :
5913 : : /* Enforced by vectorizable_reduction. */
5914 : : gcc_assert (reduc_inputs.length () == 1);
5915 : : gcc_assert (pow2p_hwi (group_size));
5916 : :
5917 : : gimple_seq seq = NULL;
5918 : :
5919 : : /* Build a vector {0, 1, 2, ...}, with the same number of elements
5920 : : and the same element size as VECTYPE. */
5921 : : tree index = build_index_vector (vectype, 0, 1);
5922 : : tree index_type = TREE_TYPE (index);
5923 : : tree index_elt_type = TREE_TYPE (index_type);
5924 : : tree mask_type = truth_type_for (index_type);
5925 : :
5926 : : /* Create a vector that, for each element, identifies which of
5927 : : the results should use it. */
5928 : : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5929 : : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5930 : : build_vector_from_val (index_type, index_mask));
5931 : :
5932 : : /* Get a neutral vector value. This is simply a splat of the neutral
5933 : : scalar value if we have one, otherwise the initial scalar value
5934 : : is itself a neutral value. */
5935 : : tree vector_identity = NULL_TREE;
5936 : : tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5937 : : NULL_TREE, false);
5938 : : if (neutral_op)
5939 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5940 : : neutral_op);
5941 : : for (unsigned int i = 0; i < group_size; ++i)
5942 : : {
5943 : : /* If there's no univeral neutral value, we can use the
5944 : : initial scalar value from the original PHI. This is used
5945 : : for MIN and MAX reduction, for example. */
5946 : : if (!neutral_op)
5947 : : {
5948 : : tree scalar_value
5949 : : = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
5950 : : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5951 : : scalar_value);
5952 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5953 : : scalar_value);
5954 : : }
5955 : :
5956 : : /* Calculate the equivalent of:
5957 : :
5958 : : sel[j] = (index[j] == i);
5959 : :
5960 : : which selects the elements of REDUC_INPUTS[0] that should
5961 : : be included in the result. */
5962 : : tree compare_val = build_int_cst (index_elt_type, i);
5963 : : compare_val = build_vector_from_val (index_type, compare_val);
5964 : : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5965 : : index, compare_val);
5966 : :
5967 : : /* Calculate the equivalent of:
5968 : :
5969 : : vec = seq ? reduc_inputs[0] : vector_identity;
5970 : :
5971 : : VEC is now suitable for a full vector reduction. */
5972 : : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5973 : : sel, reduc_inputs[0], vector_identity);
5974 : :
5975 : : /* Do the reduction and convert it to the appropriate type. */
5976 : : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5977 : : TREE_TYPE (vectype), vec);
5978 : : scalar = gimple_convert (&seq, scalar_type, scalar);
5979 : : scalar_results.safe_push (scalar);
5980 : : }
5981 : : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5982 : : }
5983 : : else
5984 : : {
5985 : 1495 : bool reduce_with_shift;
5986 : 1495 : tree vec_temp;
5987 : :
5988 : 1495 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5989 : :
5990 : : /* See if the target wants to do the final (shift) reduction
5991 : : in a vector mode of smaller size and first reduce upper/lower
5992 : : halves against each other. */
5993 : 1652 : enum machine_mode mode1 = mode;
5994 : 1652 : tree stype = TREE_TYPE (vectype);
5995 : 1652 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5996 : 1652 : unsigned nunits1 = nunits;
5997 : 1652 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5998 : 1652 : && reduc_inputs.length () == 1)
5999 : : {
6000 : 41 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6001 : : /* For SLP reductions we have to make sure lanes match up, but
6002 : : since we're doing individual element final reduction reducing
6003 : : vector width here is even more important.
6004 : : ??? We can also separate lanes with permutes, for the common
6005 : : case of power-of-two group-size odd/even extracts would work. */
6006 : 41 : if (slp_reduc && nunits != nunits1)
6007 : : {
6008 : 41 : nunits1 = least_common_multiple (nunits1, group_size);
6009 : 82 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6010 : : }
6011 : : }
6012 : 1652 : if (!slp_reduc
6013 : 1652 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6014 : 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6015 : :
6016 : 1652 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6017 : 1652 : stype, nunits1);
6018 : 1652 : reduce_with_shift = have_whole_vector_shift (mode1);
6019 : 635 : if (!VECTOR_MODE_P (mode1)
6020 : 2287 : || !directly_supported_p (code, vectype1))
6021 : : reduce_with_shift = false;
6022 : :
6023 : : /* First reduce the vector to the desired vector size we should
6024 : : do shift reduction on by combining upper and lower halves. */
6025 : 1652 : gimple_seq stmts = NULL;
6026 : 1652 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6027 : : code, &stmts);
6028 : 1652 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6029 : 1652 : reduc_inputs[0] = new_temp;
6030 : :
6031 : 1652 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
6032 : : {
6033 : 1454 : int element_bitsize = tree_to_uhwi (bitsize);
6034 : : /* Enforced by vectorizable_reduction, which disallows SLP reductions
6035 : : for variable-length vectors and also requires direct target support
6036 : : for loop reductions. */
6037 : 1454 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6038 : 1454 : int nelements = vec_size_in_bits / element_bitsize;
6039 : 1454 : vec_perm_builder sel;
6040 : 1454 : vec_perm_indices indices;
6041 : :
6042 : 1454 : int elt_offset;
6043 : :
6044 : 1454 : tree zero_vec = build_zero_cst (vectype1);
6045 : : /* Case 2: Create:
6046 : : for (offset = nelements/2; offset >= 1; offset/=2)
6047 : : {
6048 : : Create: va' = vec_shift <va, offset>
6049 : : Create: va = vop <va, va'>
6050 : : } */
6051 : :
6052 : 1454 : tree rhs;
6053 : :
6054 : 1454 : if (dump_enabled_p ())
6055 : 320 : dump_printf_loc (MSG_NOTE, vect_location,
6056 : : "Reduce using vector shifts\n");
6057 : :
6058 : 1454 : gimple_seq stmts = NULL;
6059 : 1454 : new_temp = gimple_convert (&stmts, vectype1, new_temp);
6060 : 1454 : for (elt_offset = nelements / 2;
6061 : 3181 : elt_offset >= 1;
6062 : 1727 : elt_offset /= 2)
6063 : : {
6064 : 1727 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6065 : 1727 : indices.new_vector (sel, 2, nelements);
6066 : 1727 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6067 : 1727 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6068 : : new_temp, zero_vec, mask);
6069 : 1727 : new_temp = gimple_build (&stmts, code,
6070 : : vectype1, new_name, new_temp);
6071 : : }
6072 : 1454 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6073 : :
6074 : : /* 2.4 Extract the final scalar result. Create:
6075 : : s_out3 = extract_field <v_out2, bitpos> */
6076 : :
6077 : 1454 : if (dump_enabled_p ())
6078 : 320 : dump_printf_loc (MSG_NOTE, vect_location,
6079 : : "extract scalar result\n");
6080 : :
6081 : 1454 : rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6082 : : bitsize, bitsize_zero_node);
6083 : 1454 : epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6084 : 1454 : new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6085 : 1454 : gimple_assign_set_lhs (epilog_stmt, new_temp);
6086 : 1454 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6087 : 1454 : scalar_results.safe_push (new_temp);
6088 : 1454 : }
6089 : : else
6090 : : {
6091 : : /* Case 3: Create:
6092 : : s = extract_field <v_out2, 0>
6093 : : for (offset = element_size;
6094 : : offset < vector_size;
6095 : : offset += element_size;)
6096 : : {
6097 : : Create: s' = extract_field <v_out2, offset>
6098 : : Create: s = op <s, s'> // For non SLP cases
6099 : : } */
6100 : :
6101 : 198 : if (dump_enabled_p ())
6102 : 120 : dump_printf_loc (MSG_NOTE, vect_location,
6103 : : "Reduce using scalar code.\n");
6104 : :
6105 : 198 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6106 : 198 : int element_bitsize = tree_to_uhwi (bitsize);
6107 : 198 : tree compute_type = TREE_TYPE (vectype);
6108 : 198 : gimple_seq stmts = NULL;
6109 : 448 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6110 : : {
6111 : 250 : int bit_offset;
6112 : 500 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6113 : 250 : vec_temp, bitsize, bitsize_zero_node);
6114 : :
6115 : : /* In SLP we don't need to apply reduction operation, so we just
6116 : : collect s' values in SCALAR_RESULTS. */
6117 : 250 : if (slp_reduc)
6118 : 240 : scalar_results.safe_push (new_temp);
6119 : :
6120 : 530 : for (bit_offset = element_bitsize;
6121 : 780 : bit_offset < vec_size_in_bits;
6122 : 530 : bit_offset += element_bitsize)
6123 : : {
6124 : 530 : tree bitpos = bitsize_int (bit_offset);
6125 : 530 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6126 : : compute_type, vec_temp,
6127 : : bitsize, bitpos);
6128 : 530 : if (slp_reduc)
6129 : : {
6130 : : /* In SLP we don't need to apply reduction operation, so
6131 : : we just collect s' values in SCALAR_RESULTS. */
6132 : 520 : new_temp = new_name;
6133 : 520 : scalar_results.safe_push (new_name);
6134 : : }
6135 : : else
6136 : 10 : new_temp = gimple_build (&stmts, code, compute_type,
6137 : : new_name, new_temp);
6138 : : }
6139 : : }
6140 : :
6141 : : /* The only case where we need to reduce scalar results in a SLP
6142 : : reduction, is unrolling. If the size of SCALAR_RESULTS is
6143 : : greater than GROUP_SIZE, we reduce them combining elements modulo
6144 : : GROUP_SIZE. */
6145 : 198 : if (slp_reduc)
6146 : : {
6147 : 188 : tree res, first_res, new_res;
6148 : :
6149 : : /* Reduce multiple scalar results in case of SLP unrolling. */
6150 : 447 : for (j = group_size; scalar_results.iterate (j, &res);
6151 : : j++)
6152 : : {
6153 : 259 : first_res = scalar_results[j % group_size];
6154 : 259 : new_res = gimple_build (&stmts, code, compute_type,
6155 : : first_res, res);
6156 : 259 : scalar_results[j % group_size] = new_res;
6157 : : }
6158 : 188 : scalar_results.truncate (group_size);
6159 : 877 : for (k = 0; k < group_size; k++)
6160 : 1002 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6161 : 501 : scalar_results[k]);
6162 : : }
6163 : : else
6164 : : {
6165 : : /* Reduction chain - we have one scalar to keep in
6166 : : SCALAR_RESULTS. */
6167 : 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6168 : 10 : scalar_results.safe_push (new_temp);
6169 : : }
6170 : :
6171 : 198 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6172 : : }
6173 : :
6174 : 1652 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6175 : 0 : && induc_val)
6176 : : {
6177 : : /* Earlier we set the initial value to be a vector if induc_val
6178 : : values. Check the result and if it is induc_val then replace
6179 : : with the original initial value, unless induc_val is
6180 : : the same as initial_def already. */
6181 : 0 : tree zcompare = make_ssa_name (boolean_type_node);
6182 : 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6183 : 0 : scalar_results[0], induc_val);
6184 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6185 : 0 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
6186 : 0 : tree tmp = make_ssa_name (new_scalar_dest);
6187 : 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6188 : 0 : initial_def, scalar_results[0]);
6189 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6190 : 0 : scalar_results[0] = tmp;
6191 : : }
6192 : : }
6193 : :
6194 : : /* 2.5 Adjust the final result by the initial value of the reduction
6195 : : variable. (When such adjustment is not needed, then
6196 : : 'adjustment_def' is zero). For example, if code is PLUS we create:
6197 : : new_temp = loop_exit_def + adjustment_def */
6198 : :
6199 : 21191 : if (adjustment_def)
6200 : : {
6201 : 15633 : gcc_assert (!slp_reduc || group_size == 1);
6202 : 15633 : gimple_seq stmts = NULL;
6203 : 15633 : if (double_reduc)
6204 : : {
6205 : 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6206 : 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6207 : 0 : new_temp = gimple_build (&stmts, code, vectype,
6208 : 0 : reduc_inputs[0], adjustment_def);
6209 : : }
6210 : : else
6211 : : {
6212 : 15633 : new_temp = scalar_results[0];
6213 : 15633 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6214 : 15633 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6215 : : adjustment_def);
6216 : 15633 : new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6217 : 15633 : new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6218 : : new_temp, adjustment_def);
6219 : 15633 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6220 : : }
6221 : :
6222 : 15633 : epilog_stmt = gimple_seq_last_stmt (stmts);
6223 : 15633 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6224 : 15633 : scalar_results[0] = new_temp;
6225 : : }
6226 : :
6227 : : /* Record this operation if it could be reused by the epilogue loop. */
6228 : 21191 : if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
6229 : 21191 : && reduc_inputs.length () == 1)
6230 : 21016 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6231 : : { orig_reduc_input, reduc_info });
6232 : :
6233 : : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6234 : : phis with new adjusted scalar results, i.e., replace use <s_out0>
6235 : : with use <s_out4>.
6236 : :
6237 : : Transform:
6238 : : loop_exit:
6239 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6240 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6241 : : v_out2 = reduce <v_out1>
6242 : : s_out3 = extract_field <v_out2, 0>
6243 : : s_out4 = adjust_result <s_out3>
6244 : : use <s_out0>
6245 : : use <s_out0>
6246 : :
6247 : : into:
6248 : :
6249 : : loop_exit:
6250 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6251 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6252 : : v_out2 = reduce <v_out1>
6253 : : s_out3 = extract_field <v_out2, 0>
6254 : : s_out4 = adjust_result <s_out3>
6255 : : use <s_out4>
6256 : : use <s_out4> */
6257 : :
6258 : 42382 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6259 : 21191 : auto_vec<gimple *> phis;
6260 : 42695 : for (k = 0; k < live_out_stmts.size (); k++)
6261 : : {
6262 : 21504 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6263 : 21504 : tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6264 : :
6265 : : /* Find the loop-closed-use at the loop exit of the original scalar
6266 : : result. (The reduction result is expected to have two immediate uses,
6267 : : one at the latch block, and one at the loop exit). Note with
6268 : : early break we can have two exit blocks, so pick the correct PHI. */
6269 : 88597 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6270 : 67093 : if (!is_gimple_debug (USE_STMT (use_p))
6271 : 67093 : && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6272 : : {
6273 : 21499 : gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
6274 : 21499 : if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6275 : 21491 : phis.safe_push (USE_STMT (use_p));
6276 : : }
6277 : :
6278 : 42995 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6279 : : {
6280 : : /* Replace the uses: */
6281 : 21491 : orig_name = PHI_RESULT (exit_phi);
6282 : :
6283 : : /* Look for a single use at the target of the skip edge. */
6284 : 21491 : if (unify_with_main_loop_p)
6285 : : {
6286 : 33 : use_operand_p use_p;
6287 : 33 : gimple *user;
6288 : 33 : if (!single_imm_use (orig_name, &use_p, &user))
6289 : 0 : gcc_unreachable ();
6290 : 33 : orig_name = gimple_get_lhs (user);
6291 : : }
6292 : :
6293 : 21491 : scalar_result = scalar_results[k];
6294 : 58562 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6295 : : {
6296 : 111257 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6297 : 37093 : SET_USE (use_p, scalar_result);
6298 : 37071 : update_stmt (use_stmt);
6299 : 21491 : }
6300 : : }
6301 : :
6302 : 21504 : phis.truncate (0);
6303 : : }
6304 : 21191 : }
6305 : :
6306 : : /* Return a vector of type VECTYPE that is equal to the vector select
6307 : : operation "MASK ? VEC : IDENTITY". Insert the select statements
6308 : : before GSI. */
6309 : :
6310 : : static tree
6311 : 0 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6312 : : tree vec, tree identity)
6313 : : {
6314 : 0 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6315 : 0 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6316 : : mask, vec, identity);
6317 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6318 : 0 : return cond;
6319 : : }
6320 : :
6321 : : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6322 : : order, starting with LHS. Insert the extraction statements before GSI and
6323 : : associate the new scalar SSA names with variable SCALAR_DEST.
6324 : : If MASK is nonzero mask the input and then operate on it unconditionally.
6325 : : Return the SSA name for the result. */
6326 : :
6327 : : static tree
6328 : 1001 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6329 : : tree_code code, tree lhs, tree vector_rhs,
6330 : : tree mask)
6331 : : {
6332 : 1001 : tree vectype = TREE_TYPE (vector_rhs);
6333 : 1001 : tree scalar_type = TREE_TYPE (vectype);
6334 : 1001 : tree bitsize = TYPE_SIZE (scalar_type);
6335 : 1001 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6336 : 1001 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6337 : :
6338 : : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6339 : : to perform an unconditional element-wise reduction of it. */
6340 : 1001 : if (mask)
6341 : : {
6342 : 11 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6343 : : "masked_vector_rhs");
6344 : 11 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6345 : : false);
6346 : 11 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6347 : 11 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6348 : : mask, vector_rhs, vector_identity);
6349 : 11 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6350 : 11 : vector_rhs = masked_vector_rhs;
6351 : : }
6352 : :
6353 : 1001 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6354 : 4297 : bit_offset < vec_size_in_bits;
6355 : 3296 : bit_offset += element_bitsize)
6356 : : {
6357 : 3296 : tree bitpos = bitsize_int (bit_offset);
6358 : 3296 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6359 : : bitsize, bitpos);
6360 : :
6361 : 3296 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6362 : 3296 : rhs = make_ssa_name (scalar_dest, stmt);
6363 : 3296 : gimple_assign_set_lhs (stmt, rhs);
6364 : 3296 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6365 : : /* Fold the vector extract, combining it with a previous reversal
6366 : : like seen in PR90579. */
6367 : 3296 : auto gsi2 = gsi_for_stmt (stmt);
6368 : 3296 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6369 : 356 : update_stmt (gsi_stmt (gsi2));
6370 : :
6371 : 3296 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6372 : 3296 : tree new_name = make_ssa_name (scalar_dest, stmt);
6373 : 3296 : gimple_assign_set_lhs (stmt, new_name);
6374 : 3296 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6375 : 3296 : lhs = new_name;
6376 : : }
6377 : 1001 : return lhs;
6378 : : }
6379 : :
6380 : : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6381 : : type of the vector input. */
6382 : :
6383 : : static internal_fn
6384 : 846 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6385 : : {
6386 : 846 : internal_fn mask_reduc_fn;
6387 : 846 : internal_fn mask_len_reduc_fn;
6388 : :
6389 : 846 : switch (reduc_fn)
6390 : : {
6391 : 0 : case IFN_FOLD_LEFT_PLUS:
6392 : 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6393 : 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6394 : 0 : break;
6395 : :
6396 : : default:
6397 : : return IFN_LAST;
6398 : : }
6399 : :
6400 : 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6401 : : OPTIMIZE_FOR_SPEED))
6402 : : return mask_reduc_fn;
6403 : 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6404 : : OPTIMIZE_FOR_SPEED))
6405 : : return mask_len_reduc_fn;
6406 : : return IFN_LAST;
6407 : : }
6408 : :
6409 : : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6410 : : statement that sets the live-out value. REDUC_DEF_STMT is the phi
6411 : : statement. CODE is the operation performed by STMT_INFO and OPS are
6412 : : its scalar operands. REDUC_INDEX is the index of the operand in
6413 : : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6414 : : implements in-order reduction, or IFN_LAST if we should open-code it.
6415 : : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6416 : : that should be used to control the operation in a fully-masked loop. */
6417 : :
6418 : : static bool
6419 : 838 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6420 : : stmt_vec_info stmt_info,
6421 : : gimple_stmt_iterator *gsi,
6422 : : slp_tree slp_node,
6423 : : code_helper code, internal_fn reduc_fn,
6424 : : int num_ops, tree vectype_in,
6425 : : int reduc_index, vec_loop_masks *masks,
6426 : : vec_loop_lens *lens)
6427 : : {
6428 : 838 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6429 : 838 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
6430 : 838 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6431 : :
6432 : 838 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6433 : :
6434 : 838 : bool is_cond_op = false;
6435 : 838 : if (!code.is_tree_code ())
6436 : : {
6437 : 9 : code = conditional_internal_fn_code (internal_fn (code));
6438 : 9 : gcc_assert (code != ERROR_MARK);
6439 : : is_cond_op = true;
6440 : : }
6441 : :
6442 : 838 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
6443 : :
6444 : 838 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6445 : : TYPE_VECTOR_SUBPARTS (vectype_in)));
6446 : :
6447 : : /* ??? We should, when transforming the cycle PHI, record the existing
6448 : : scalar def as vector def so looking up the vector def works. This
6449 : : would also allow generalizing this for reduction paths of length > 1
6450 : : and/or SLP reductions. */
6451 : 838 : slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
6452 : 838 : tree reduc_var = vect_get_slp_scalar_def (reduc_node, 0);
6453 : :
6454 : : /* The operands either come from a binary operation or an IFN_COND operation.
6455 : : The former is a gimple assign with binary rhs and the latter is a
6456 : : gimple call with four arguments. */
6457 : 838 : gcc_assert (num_ops == 2 || num_ops == 4);
6458 : :
6459 : 838 : int group_size = 1;
6460 : 838 : stmt_vec_info scalar_dest_def_info;
6461 : 838 : auto_vec<tree> vec_oprnds0, vec_opmask;
6462 : 838 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
6463 : 838 : + (1 - reduc_index)],
6464 : : &vec_oprnds0);
6465 : 838 : group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6466 : 838 : scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6467 : : /* For an IFN_COND_OP we also need the vector mask operand. */
6468 : 838 : if (is_cond_op)
6469 : 9 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
6470 : :
6471 : 838 : gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
6472 : 838 : tree scalar_dest = gimple_get_lhs (sdef);
6473 : 838 : tree scalar_type = TREE_TYPE (scalar_dest);
6474 : :
6475 : 838 : int vec_num = vec_oprnds0.length ();
6476 : 838 : tree vec_elem_type = TREE_TYPE (vectype_out);
6477 : 838 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6478 : :
6479 : 838 : tree vector_identity = NULL_TREE;
6480 : 838 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6481 : : {
6482 : 0 : vector_identity = build_zero_cst (vectype_out);
6483 : 0 : if (!HONOR_SIGNED_ZEROS (vectype_out))
6484 : : ;
6485 : : else
6486 : : {
6487 : 0 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6488 : 0 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6489 : : vector_identity);
6490 : : }
6491 : : }
6492 : :
6493 : 838 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6494 : 838 : int i;
6495 : 838 : tree def0;
6496 : 1839 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6497 : : {
6498 : 1001 : gimple *new_stmt;
6499 : 1001 : tree mask = NULL_TREE;
6500 : 1001 : tree len = NULL_TREE;
6501 : 1001 : tree bias = NULL_TREE;
6502 : 1001 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6503 : : {
6504 : 0 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
6505 : : vec_num, vectype_in, i);
6506 : 0 : if (is_cond_op)
6507 : 0 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
6508 : 0 : loop_mask, vec_opmask[i], gsi);
6509 : : else
6510 : : mask = loop_mask;
6511 : : }
6512 : 1001 : else if (is_cond_op)
6513 : 11 : mask = vec_opmask[i];
6514 : 1001 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6515 : : {
6516 : 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6517 : : i, 1);
6518 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6519 : 0 : bias = build_int_cst (intQI_type_node, biasval);
6520 : 0 : if (!is_cond_op)
6521 : 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
6522 : : }
6523 : :
6524 : : /* Handle MINUS by adding the negative. */
6525 : 1001 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6526 : : {
6527 : 0 : tree negated = make_ssa_name (vectype_out);
6528 : 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6529 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6530 : 0 : def0 = negated;
6531 : : }
6532 : :
6533 : 0 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
6534 : 1001 : && mask && mask_reduc_fn == IFN_LAST)
6535 : 0 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6536 : : vector_identity);
6537 : :
6538 : : /* On the first iteration the input is simply the scalar phi
6539 : : result, and for subsequent iterations it is the output of
6540 : : the preceding operation. */
6541 : 1001 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6542 : : {
6543 : 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6544 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6545 : : def0, mask, len, bias);
6546 : 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6547 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6548 : : def0, mask);
6549 : : else
6550 : 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6551 : : def0);
6552 : : /* For chained SLP reductions the output of the previous reduction
6553 : : operation serves as the input of the next. For the final statement
6554 : : the output cannot be a temporary - we reuse the original
6555 : : scalar destination of the last statement. */
6556 : 0 : if (i != vec_num - 1)
6557 : : {
6558 : 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
6559 : 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6560 : 0 : gimple_set_lhs (new_stmt, reduc_var);
6561 : : }
6562 : : }
6563 : : else
6564 : : {
6565 : 1001 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
6566 : : tree_code (code), reduc_var, def0,
6567 : : mask);
6568 : 1001 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6569 : : /* Remove the statement, so that we can use the same code paths
6570 : : as for statements that we've just created. */
6571 : 1001 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6572 : 1001 : gsi_remove (&tmp_gsi, true);
6573 : : }
6574 : :
6575 : 1001 : if (i == vec_num - 1)
6576 : : {
6577 : 838 : gimple_set_lhs (new_stmt, scalar_dest);
6578 : 838 : vect_finish_replace_stmt (loop_vinfo,
6579 : : scalar_dest_def_info,
6580 : : new_stmt);
6581 : : }
6582 : : else
6583 : 163 : vect_finish_stmt_generation (loop_vinfo,
6584 : : scalar_dest_def_info,
6585 : : new_stmt, gsi);
6586 : :
6587 : 1001 : slp_node->push_vec_def (new_stmt);
6588 : : }
6589 : :
6590 : 838 : return true;
6591 : 838 : }
6592 : :
6593 : : /* Function is_nonwrapping_integer_induction.
6594 : :
6595 : : Check if STMT_VINO (which is part of loop LOOP) both increments and
6596 : : does not cause overflow. */
6597 : :
6598 : : static bool
6599 : 377 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6600 : : {
6601 : 377 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6602 : 377 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6603 : 377 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6604 : 377 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6605 : 377 : widest_int ni, max_loop_value, lhs_max;
6606 : 377 : wi::overflow_type overflow = wi::OVF_NONE;
6607 : :
6608 : : /* Make sure the loop is integer based. */
6609 : 377 : if (TREE_CODE (base) != INTEGER_CST
6610 : 112 : || TREE_CODE (step) != INTEGER_CST)
6611 : : return false;
6612 : :
6613 : : /* Check that the max size of the loop will not wrap. */
6614 : :
6615 : 112 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6616 : : return true;
6617 : :
6618 : 8 : if (! max_stmt_executions (loop, &ni))
6619 : : return false;
6620 : :
6621 : 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6622 : 8 : &overflow);
6623 : 8 : if (overflow)
6624 : : return false;
6625 : :
6626 : 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6627 : 16 : TYPE_SIGN (lhs_type), &overflow);
6628 : 8 : if (overflow)
6629 : : return false;
6630 : :
6631 : 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6632 : 8 : <= TYPE_PRECISION (lhs_type));
6633 : 377 : }
6634 : :
6635 : : /* Check if masking can be supported by inserting a conditional expression.
6636 : : CODE is the code for the operation. COND_FN is the conditional internal
6637 : : function, if it exists. VECTYPE_IN is the type of the vector input. */
6638 : : static bool
6639 : 2333 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6640 : : tree vectype_in)
6641 : : {
6642 : 2333 : if (cond_fn != IFN_LAST
6643 : 2333 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
6644 : : OPTIMIZE_FOR_SPEED))
6645 : : return false;
6646 : :
6647 : 2287 : if (code.is_tree_code ())
6648 : 2010 : switch (tree_code (code))
6649 : : {
6650 : : case DOT_PROD_EXPR:
6651 : : case SAD_EXPR:
6652 : : return true;
6653 : :
6654 : : default:
6655 : : break;
6656 : : }
6657 : : return false;
6658 : : }
6659 : :
6660 : : /* Insert a conditional expression to enable masked vectorization. CODE is the
6661 : : code for the operation. VOP is the array of operands. MASK is the loop
6662 : : mask. GSI is a statement iterator used to place the new conditional
6663 : : expression. */
6664 : : static void
6665 : 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6666 : : gimple_stmt_iterator *gsi)
6667 : : {
6668 : 4 : switch (tree_code (code))
6669 : : {
6670 : 4 : case DOT_PROD_EXPR:
6671 : 4 : {
6672 : 4 : tree vectype = TREE_TYPE (vop[1]);
6673 : 4 : tree zero = build_zero_cst (vectype);
6674 : 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6675 : 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6676 : : mask, vop[1], zero);
6677 : 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6678 : 4 : vop[1] = masked_op1;
6679 : 4 : break;
6680 : : }
6681 : :
6682 : 0 : case SAD_EXPR:
6683 : 0 : {
6684 : 0 : tree vectype = TREE_TYPE (vop[1]);
6685 : 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6686 : 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6687 : : mask, vop[1], vop[0]);
6688 : 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6689 : 0 : vop[1] = masked_op1;
6690 : 0 : break;
6691 : : }
6692 : :
6693 : 0 : default:
6694 : 0 : gcc_unreachable ();
6695 : : }
6696 : 4 : }
6697 : :
6698 : : /* Given an operation with CODE in loop reduction path whose reduction PHI is
6699 : : specified by REDUC_INFO, the operation has TYPE of scalar result, and its
6700 : : input vectype is represented by VECTYPE_IN. The vectype of vectorized result
6701 : : may be different from VECTYPE_IN, either in base type or vectype lanes,
6702 : : lane-reducing operation is the case. This function check if it is possible,
6703 : : and how to perform partial vectorization on the operation in the context
6704 : : of LOOP_VINFO. */
6705 : :
6706 : : static void
6707 : 8 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
6708 : : vect_reduc_info reduc_info,
6709 : : slp_tree slp_node,
6710 : : code_helper code, tree type,
6711 : : tree vectype_in)
6712 : : {
6713 : 8 : enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
6714 : 8 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
6715 : 8 : internal_fn cond_fn = get_conditional_internal_fn (code, type);
6716 : :
6717 : 8 : if (reduc_type != FOLD_LEFT_REDUCTION
6718 : 8 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6719 : 12 : && (cond_fn == IFN_LAST
6720 : 4 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6721 : : OPTIMIZE_FOR_SPEED)))
6722 : : {
6723 : 0 : if (dump_enabled_p ())
6724 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6725 : : "can't operate on partial vectors because"
6726 : : " no conditional operation is available.\n");
6727 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6728 : : }
6729 : 8 : else if (reduc_type == FOLD_LEFT_REDUCTION
6730 : 8 : && reduc_fn == IFN_LAST
6731 : 8 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
6732 : : {
6733 : 0 : if (dump_enabled_p ())
6734 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6735 : : "can't operate on partial vectors because"
6736 : : " no conditional operation is available.\n");
6737 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6738 : : }
6739 : 8 : else if (reduc_type == FOLD_LEFT_REDUCTION
6740 : 0 : && internal_fn_mask_index (reduc_fn) == -1
6741 : 0 : && FLOAT_TYPE_P (vectype_in)
6742 : 8 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
6743 : : {
6744 : 0 : if (dump_enabled_p ())
6745 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6746 : : "can't operate on partial vectors because"
6747 : : " signed zeros cannot be preserved.\n");
6748 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6749 : : }
6750 : : else
6751 : : {
6752 : 8 : internal_fn mask_reduc_fn
6753 : 8 : = get_masked_reduction_fn (reduc_fn, vectype_in);
6754 : 8 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6755 : 8 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
6756 : 8 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);
6757 : :
6758 : 8 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6759 : 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
6760 : : else
6761 : 8 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
6762 : : }
6763 : 8 : }
6764 : :
6765 : : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
6766 : : the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
6767 : : and the analysis is for slp if SLP_NODE is not NULL.
6768 : :
6769 : : For a lane-reducing operation, the loop reduction path that it lies in,
6770 : : may contain normal operation, or other lane-reducing operation of different
6771 : : input type size, an example as:
6772 : :
6773 : : int sum = 0;
6774 : : for (i)
6775 : : {
6776 : : ...
6777 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
6778 : : sum += w[i]; // widen-sum <vector(16) char>
6779 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
6780 : : sum += n[i]; // normal <vector(4) int>
6781 : : ...
6782 : : }
6783 : :
6784 : : Vectorization factor is essentially determined by operation whose input
6785 : : vectype has the most lanes ("vector(16) char" in the example), while we
6786 : : need to choose input vectype with the least lanes ("vector(4) int" in the
6787 : : example) to determine effective number of vector reduction PHIs. */
6788 : :
6789 : : bool
6790 : 315169 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
6791 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
6792 : : {
6793 : 315169 : gimple *stmt = stmt_info->stmt;
6794 : :
6795 : 315169 : if (!lane_reducing_stmt_p (stmt))
6796 : : return false;
6797 : :
6798 : 439 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
6799 : :
6800 : 439 : if (!INTEGRAL_TYPE_P (type))
6801 : : return false;
6802 : :
6803 : : /* Do not try to vectorize bit-precision reductions. */
6804 : 439 : if (!type_has_mode_precision_p (type))
6805 : : return false;
6806 : :
6807 : 439 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6808 : :
6809 : : /* TODO: Support lane-reducing operation that does not directly participate
6810 : : in loop reduction. */
6811 : 439 : if (!reduc_info)
6812 : : return false;
6813 : :
6814 : : /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
6815 : : recoginized. */
6816 : 439 : gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
6817 : 439 : gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);
6818 : :
6819 : 1756 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
6820 : : {
6821 : 1317 : slp_tree slp_op;
6822 : 1317 : tree op;
6823 : 1317 : tree vectype;
6824 : 1317 : enum vect_def_type dt;
6825 : :
6826 : 1317 : if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
6827 : : &slp_op, &dt, &vectype))
6828 : : {
6829 : 0 : if (dump_enabled_p ())
6830 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6831 : : "use not simple.\n");
6832 : 0 : return false;
6833 : : }
6834 : :
6835 : 1317 : if (!vectype)
6836 : : {
6837 : 4 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
6838 : : slp_op);
6839 : 4 : if (!vectype)
6840 : : return false;
6841 : : }
6842 : :
6843 : 1317 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
6844 : : {
6845 : 0 : if (dump_enabled_p ())
6846 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6847 : : "incompatible vector types for invariants\n");
6848 : 0 : return false;
6849 : : }
6850 : :
6851 : 1317 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6852 : 439 : continue;
6853 : :
6854 : : /* There should be at most one cycle def in the stmt. */
6855 : 878 : if (VECTORIZABLE_CYCLE_DEF (dt))
6856 : : return false;
6857 : : }
6858 : :
6859 : 439 : slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
6860 : 439 : tree vectype_in = SLP_TREE_VECTYPE (node_in);
6861 : 439 : gcc_assert (vectype_in);
6862 : :
6863 : : /* Compute number of effective vector statements for costing. */
6864 : 439 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
6865 : 439 : gcc_assert (ncopies_for_cost >= 1);
6866 : :
6867 : 439 : if (vect_is_emulated_mixed_dot_prod (slp_node))
6868 : : {
6869 : : /* We need extra two invariants: one that contains the minimum signed
6870 : : value and one that contains half of its negative. */
6871 : 8 : int prologue_stmts = 2;
6872 : 8 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
6873 : : scalar_to_vec, slp_node, 0,
6874 : : vect_prologue);
6875 : 8 : if (dump_enabled_p ())
6876 : 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
6877 : : "extra prologue_cost = %d .\n", cost);
6878 : :
6879 : : /* Three dot-products and a subtraction. */
6880 : 8 : ncopies_for_cost *= 4;
6881 : : }
6882 : :
6883 : 439 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
6884 : : 0, vect_body);
6885 : :
6886 : 439 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
6887 : : {
6888 : 4 : enum tree_code code = gimple_assign_rhs_code (stmt);
6889 : 4 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
6890 : 4 : node_in, code, type,
6891 : : vectype_in);
6892 : : }
6893 : :
6894 : : /* Transform via vect_transform_reduction. */
6895 : 439 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6896 : 439 : return true;
6897 : : }
6898 : :
6899 : : /* Function vectorizable_reduction.
6900 : :
6901 : : Check if STMT_INFO performs a reduction operation that can be vectorized.
6902 : : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6903 : : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6904 : : Return true if STMT_INFO is vectorizable in this way.
6905 : :
6906 : : This function also handles reduction idioms (patterns) that have been
6907 : : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6908 : : may be of this form:
6909 : : X = pattern_expr (arg0, arg1, ..., X)
6910 : : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6911 : : sequence that had been detected and replaced by the pattern-stmt
6912 : : (STMT_INFO).
6913 : :
6914 : : This function also handles reduction of condition expressions, for example:
6915 : : for (int i = 0; i < N; i++)
6916 : : if (a[i] < value)
6917 : : last = a[i];
6918 : : This is handled by vectorising the loop and creating an additional vector
6919 : : containing the loop indexes for which "a[i] < value" was true. In the
6920 : : function epilogue this is reduced to a single max value and then used to
6921 : : index into the vector of results.
6922 : :
6923 : : In some cases of reduction patterns, the type of the reduction variable X is
6924 : : different than the type of the other arguments of STMT_INFO.
6925 : : In such cases, the vectype that is used when transforming STMT_INFO into
6926 : : a vector stmt is different than the vectype that is used to determine the
6927 : : vectorization factor, because it consists of a different number of elements
6928 : : than the actual number of elements that are being operated upon in parallel.
6929 : :
6930 : : For example, consider an accumulation of shorts into an int accumulator.
6931 : : On some targets it's possible to vectorize this pattern operating on 8
6932 : : shorts at a time (hence, the vectype for purposes of determining the
6933 : : vectorization factor should be V8HI); on the other hand, the vectype that
6934 : : is used to create the vector form is actually V4SI (the type of the result).
6935 : :
6936 : : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6937 : : indicates what is the actual level of parallelism (V8HI in the example), so
6938 : : that the right vectorization factor would be derived. This vectype
6939 : : corresponds to the type of arguments to the reduction stmt, and should *NOT*
6940 : : be used to create the vectorized stmt. The right vectype for the vectorized
6941 : : stmt is obtained from the type of the result X:
6942 : : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6943 : :
6944 : : This means that, contrary to "regular" reductions (or "regular" stmts in
6945 : : general), the following equation:
6946 : : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6947 : : does *NOT* necessarily hold for reduction patterns. */
6948 : :
6949 : : bool
6950 : 314730 : vectorizable_reduction (loop_vec_info loop_vinfo,
6951 : : stmt_vec_info stmt_info, slp_tree slp_node,
6952 : : slp_instance slp_node_instance,
6953 : : stmt_vector_for_cost *cost_vec)
6954 : : {
6955 : 314730 : tree vectype_in = NULL_TREE;
6956 : 314730 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6957 : 314730 : stmt_vec_info cond_stmt_vinfo = NULL;
6958 : 314730 : int i;
6959 : 314730 : int ncopies;
6960 : 314730 : bool single_defuse_cycle = false;
6961 : 314730 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6962 : 314730 : tree cond_reduc_val = NULL_TREE;
6963 : 314730 : const bool reduc_chain
6964 : 314730 : = SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_chain;
6965 : :
6966 : : /* Make sure it was already recognized as a reduction computation. */
6967 : 314730 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6968 : : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6969 : 314730 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6970 : : return false;
6971 : :
6972 : : /* The reduction meta. */
6973 : 53921 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6974 : :
6975 : 53921 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6976 : : {
6977 : 1290 : gcc_assert (is_a <gphi *> (stmt_info->stmt));
6978 : : /* We eventually need to set a vector type on invariant arguments. */
6979 : : unsigned j;
6980 : : slp_tree child;
6981 : 3870 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6982 : 2580 : if (!vect_maybe_update_slp_op_vectype (child,
6983 : : SLP_TREE_VECTYPE (slp_node)))
6984 : : {
6985 : 0 : if (dump_enabled_p ())
6986 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6987 : : "incompatible vector types for "
6988 : : "invariants\n");
6989 : 0 : return false;
6990 : : }
6991 : : /* Analysis for double-reduction is done on the outer
6992 : : loop PHI, nested cycles have no further restrictions. */
6993 : 1290 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
6994 : 1290 : return true;
6995 : : }
6996 : :
6997 : 52631 : if (!is_a <gphi *> (stmt_info->stmt))
6998 : : {
6999 : 6809 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
7000 : 6809 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
7001 : 6809 : return true;
7002 : : }
7003 : :
7004 : 45822 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7005 : 45822 : stmt_vec_info phi_info = stmt_info;
7006 : 45822 : bool double_reduc = false;
7007 : 45822 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7008 : : {
7009 : : /* We arrive here for both the inner loop LC PHI and the
7010 : : outer loop PHI. The latter is what we want to analyze the
7011 : : reduction with. The LC PHI is handled by vectorizable_lc_phi. */
7012 : 388 : if (gimple_bb (stmt_info->stmt) != loop->header)
7013 : 109 : return false;
7014 : :
7015 : : /* Set loop and phi_info to the inner loop. */
7016 : 279 : use_operand_p use_p;
7017 : 279 : gimple *use_stmt;
7018 : 279 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7019 : : &use_p, &use_stmt);
7020 : 279 : gcc_assert (res);
7021 : 279 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
7022 : 279 : loop = loop->inner;
7023 : 279 : double_reduc = true;
7024 : : }
7025 : :
7026 : 45713 : slp_node_instance->reduc_phis = slp_node;
7027 : : /* ??? We're leaving slp_node to point to the PHIs, we only
7028 : : need it to get at the number of vector stmts which wasn't
7029 : : yet initialized for the instance root. */
7030 : :
7031 : : /* PHIs should not participate in patterns. */
7032 : 45713 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7033 : 45713 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7034 : :
7035 : : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7036 : : and compute the reduction chain length. Discover the real
7037 : : reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7038 : 45713 : tree reduc_def
7039 : 45713 : = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_latch_edge (loop));
7040 : 45713 : unsigned reduc_chain_length = 0;
7041 : 45713 : bool only_slp_reduc_chain = true;
7042 : 45713 : stmt_info = NULL;
7043 : 45713 : slp_tree slp_for_stmt_info = NULL;
7044 : 45713 : slp_tree vdef_slp = slp_node_instance->root;
7045 : : /* For double-reductions we start SLP analysis at the inner loop LC PHI
7046 : : which is the def of the outer loop live stmt. */
7047 : 45713 : if (double_reduc)
7048 : 279 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
7049 : 101520 : while (reduc_def != PHI_RESULT (reduc_def_phi))
7050 : : {
7051 : 55831 : stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7052 : 55831 : stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7053 : 55831 : int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
7054 : 55831 : if (STMT_VINFO_REDUC_IDX (vdef) == -1
7055 : 55831 : || SLP_TREE_REDUC_IDX (vdef_slp) == -1)
7056 : : {
7057 : 0 : if (dump_enabled_p ())
7058 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7059 : : "reduction chain broken by patterns.\n");
7060 : 24 : return false;
7061 : : }
7062 : 55831 : if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7063 : 49118 : only_slp_reduc_chain = false;
7064 : 55831 : gimple_match_op op;
7065 : 55831 : if (!gimple_extract_op (vdef->stmt, &op))
7066 : : {
7067 : 0 : if (dump_enabled_p ())
7068 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7069 : : "reduction chain includes unsupported"
7070 : : " statement type.\n");
7071 : 0 : return false;
7072 : : }
7073 : 55831 : if (CONVERT_EXPR_CODE_P (op.code))
7074 : : {
7075 : 3210 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7076 : : {
7077 : 24 : if (dump_enabled_p ())
7078 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7079 : : "conversion in the reduction chain.\n");
7080 : 24 : return false;
7081 : : }
7082 : 3186 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
7083 : : }
7084 : : else
7085 : : {
7086 : : /* First non-conversion stmt. */
7087 : 52621 : if (!stmt_info)
7088 : : {
7089 : 45709 : stmt_info = vdef;
7090 : 45709 : slp_for_stmt_info = vdef_slp;
7091 : : }
7092 : :
7093 : 52621 : if (lane_reducing_op_p (op.code))
7094 : : {
7095 : : /* The last operand of lane-reducing operation is for
7096 : : reduction. */
7097 : 708 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7098 : :
7099 : 708 : slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
7100 : 708 : tree vectype_op = SLP_TREE_VECTYPE (op_node);
7101 : 708 : tree type_op = TREE_TYPE (op.ops[0]);
7102 : 708 : if (!vectype_op)
7103 : : {
7104 : 9 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7105 : : type_op);
7106 : 9 : if (!vectype_op
7107 : 9 : || !vect_maybe_update_slp_op_vectype (op_node,
7108 : : vectype_op))
7109 : 0 : return false;
7110 : : }
7111 : :
7112 : : /* To accommodate lane-reducing operations of mixed input
7113 : : vectypes, choose input vectype with the least lanes for the
7114 : : reduction PHI statement, which would result in the most
7115 : : ncopies for vectorized reduction results. */
7116 : 708 : if (!vectype_in
7117 : 708 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7118 : 584 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7119 : 416 : vectype_in = vectype_op;
7120 : : }
7121 : 51913 : else if (!vectype_in)
7122 : 45293 : vectype_in = SLP_TREE_VECTYPE (slp_node);
7123 : 52621 : if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7124 : : {
7125 : 45929 : gcc_assert (reduc_idx == SLP_TREE_REDUC_IDX (vdef_slp));
7126 : 45929 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7127 : : }
7128 : : }
7129 : :
7130 : 55807 : reduc_def = op.ops[reduc_idx];
7131 : 55807 : reduc_chain_length++;
7132 : : }
7133 : : /* PHIs should not participate in patterns. */
7134 : 45689 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7135 : :
7136 : : /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7137 : : element. */
7138 : 45689 : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7139 : : {
7140 : 2754 : gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7141 : : stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7142 : : }
7143 : 45689 : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7144 : 2754 : gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7145 : :
7146 : : /* 1. Is vectorizable reduction? */
7147 : : /* Not supportable if the reduction variable is used in the loop, unless
7148 : : it's a reduction chain. */
7149 : 45689 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7150 : 0 : && !reduc_chain)
7151 : : return false;
7152 : :
7153 : : /* Reductions that are not used even in an enclosing outer-loop,
7154 : : are expected to be "live" (used out of the loop). */
7155 : 45689 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7156 : 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7157 : : return false;
7158 : :
7159 : : /* 2. Has this been recognized as a reduction pattern?
7160 : :
7161 : : Check if STMT represents a pattern that has been recognized
7162 : : in earlier analysis stages. For stmts that represent a pattern,
7163 : : the STMT_VINFO_RELATED_STMT field records the last stmt in
7164 : : the original sequence that constitutes the pattern. */
7165 : :
7166 : 45689 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7167 : 45689 : if (orig_stmt_info)
7168 : : {
7169 : 2994 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7170 : 2994 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7171 : : }
7172 : :
7173 : : /* 3. Check the operands of the operation. The first operands are defined
7174 : : inside the loop body. The last operand is the reduction variable,
7175 : : which is defined by the loop-header-phi. */
7176 : :
7177 : 45689 : tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
7178 : 45689 : VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
7179 : :
7180 : 45689 : gimple_match_op op;
7181 : 45689 : if (!gimple_extract_op (stmt_info->stmt, &op))
7182 : 0 : gcc_unreachable ();
7183 : 45689 : bool lane_reducing = lane_reducing_op_p (op.code);
7184 : :
7185 : 45689 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7186 : 13293 : && !SCALAR_FLOAT_TYPE_P (op.type))
7187 : : return false;
7188 : :
7189 : : /* Do not try to vectorize bit-precision reductions. */
7190 : 45689 : if (!type_has_mode_precision_p (op.type))
7191 : : return false;
7192 : :
7193 : : /* Lane-reducing ops also never can be used in a SLP reduction group
7194 : : since we'll mix lanes belonging to different reductions. But it's
7195 : : OK to use them in a reduction chain or when the reduction group
7196 : : has just one element. */
7197 : 44060 : if (lane_reducing
7198 : 44060 : && !reduc_chain
7199 : 389 : && SLP_TREE_LANES (slp_node) > 1)
7200 : : {
7201 : 0 : if (dump_enabled_p ())
7202 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7203 : : "lane-reducing reduction in reduction group.\n");
7204 : 0 : return false;
7205 : : }
7206 : :
7207 : : /* All uses but the last are expected to be defined in the loop.
7208 : : The last use is the reduction variable. In case of nested cycle this
7209 : : assumption is not true: we use reduc_index to record the index of the
7210 : : reduction variable. */
7211 : 44060 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7212 : 44060 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7213 : : /* We need to skip an extra operand for COND_EXPRs with embedded
7214 : : comparison. */
7215 : 44060 : unsigned opno_adjust = 0;
7216 : 44060 : if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7217 : 44060 : opno_adjust = 1;
7218 : 139871 : for (i = 0; i < (int) op.num_ops; i++)
7219 : : {
7220 : : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7221 : 95819 : if (i == 0 && op.code == COND_EXPR)
7222 : 48038 : continue;
7223 : :
7224 : 95138 : stmt_vec_info def_stmt_info;
7225 : 95138 : enum vect_def_type dt;
7226 : 95138 : if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
7227 : 95138 : i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7228 : 95138 : &vectype_op[i], &def_stmt_info))
7229 : : {
7230 : 0 : if (dump_enabled_p ())
7231 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7232 : : "use not simple.\n");
7233 : 8 : return false;
7234 : : }
7235 : :
7236 : : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7237 : : reduction operand twice (once as definition, once as else). */
7238 : 95138 : if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7239 : 47357 : continue;
7240 : :
7241 : : /* There should be only one cycle def in the stmt, the one
7242 : : leading to reduc_def. */
7243 : 47781 : if (VECTORIZABLE_CYCLE_DEF (dt))
7244 : : return false;
7245 : :
7246 : 47773 : if (!vectype_op[i])
7247 : 3859 : vectype_op[i]
7248 : 3859 : = get_vectype_for_scalar_type (loop_vinfo,
7249 : 3859 : TREE_TYPE (op.ops[i]), slp_op[i]);
7250 : :
7251 : : /* Record how the non-reduction-def value of COND_EXPR is defined.
7252 : : ??? For a chain of multiple CONDs we'd have to match them up all. */
7253 : 47773 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7254 : : {
7255 : 652 : if (dt == vect_constant_def)
7256 : : {
7257 : 45 : cond_reduc_dt = dt;
7258 : 45 : cond_reduc_val = op.ops[i];
7259 : : }
7260 : 607 : else if (dt == vect_induction_def
7261 : 377 : && def_stmt_info
7262 : 984 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7263 : : {
7264 : 112 : cond_reduc_dt = dt;
7265 : 112 : cond_stmt_vinfo = def_stmt_info;
7266 : : }
7267 : : }
7268 : : }
7269 : :
7270 : 44052 : enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7271 : : /* If we have a condition reduction, see if we can simplify it further. */
7272 : 44052 : if (reduction_type == COND_REDUCTION)
7273 : : {
7274 : 669 : if (SLP_TREE_LANES (slp_node) != 1)
7275 : : return false;
7276 : :
7277 : : /* When the condition uses the reduction value in the condition, fail. */
7278 : 669 : if (SLP_TREE_REDUC_IDX (slp_node) == 0)
7279 : : {
7280 : 0 : if (dump_enabled_p ())
7281 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7282 : : "condition depends on previous iteration\n");
7283 : 0 : return false;
7284 : : }
7285 : :
7286 : 669 : if (reduc_chain_length == 1
7287 : 669 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7288 : : OPTIMIZE_FOR_SPEED)
7289 : 640 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7290 : : vectype_in,
7291 : : OPTIMIZE_FOR_SPEED)))
7292 : : {
7293 : 0 : if (dump_enabled_p ())
7294 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7295 : : "optimizing condition reduction with"
7296 : : " FOLD_EXTRACT_LAST.\n");
7297 : 0 : VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7298 : : }
7299 : 669 : else if (cond_reduc_dt == vect_induction_def)
7300 : : {
7301 : 112 : tree base
7302 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7303 : 112 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7304 : :
7305 : 112 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7306 : : && TREE_CODE (step) == INTEGER_CST);
7307 : 112 : cond_reduc_val = NULL_TREE;
7308 : 112 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7309 : 112 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7310 : 112 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7311 : : ;
7312 : : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7313 : : above base; punt if base is the minimum value of the type for
7314 : : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7315 : 100 : else if (tree_int_cst_sgn (step) == -1)
7316 : : {
7317 : 20 : cond_reduc_op_code = MIN_EXPR;
7318 : 20 : if (tree_int_cst_sgn (base) == -1)
7319 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7320 : 20 : else if (tree_int_cst_lt (base,
7321 : 20 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7322 : 20 : cond_reduc_val
7323 : 20 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7324 : : }
7325 : : else
7326 : : {
7327 : 80 : cond_reduc_op_code = MAX_EXPR;
7328 : 80 : if (tree_int_cst_sgn (base) == 1)
7329 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7330 : 80 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7331 : : base))
7332 : 80 : cond_reduc_val
7333 : 80 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7334 : : }
7335 : 100 : if (cond_reduc_val)
7336 : : {
7337 : 100 : if (dump_enabled_p ())
7338 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
7339 : : "condition expression based on "
7340 : : "integer induction.\n");
7341 : 100 : VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
7342 : 100 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
7343 : 100 : = cond_reduc_val;
7344 : 100 : VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7345 : : }
7346 : : }
7347 : 557 : else if (cond_reduc_dt == vect_constant_def)
7348 : : {
7349 : 45 : enum vect_def_type cond_initial_dt;
7350 : 45 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7351 : 45 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7352 : 45 : if (cond_initial_dt == vect_constant_def
7353 : 67 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7354 : 22 : TREE_TYPE (cond_reduc_val)))
7355 : : {
7356 : 22 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7357 : : cond_initial_val, cond_reduc_val);
7358 : 22 : if (e && (integer_onep (e) || integer_zerop (e)))
7359 : : {
7360 : 22 : if (dump_enabled_p ())
7361 : 16 : dump_printf_loc (MSG_NOTE, vect_location,
7362 : : "condition expression based on "
7363 : : "compile time constant.\n");
7364 : : /* Record reduction code at analysis stage. */
7365 : 22 : VECT_REDUC_INFO_CODE (reduc_info)
7366 : 22 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7367 : 22 : VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
7368 : : }
7369 : : }
7370 : : }
7371 : : }
7372 : :
7373 : 44052 : if (STMT_VINFO_LIVE_P (phi_info))
7374 : : return false;
7375 : :
7376 : 44052 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
7377 : :
7378 : 44052 : gcc_assert (ncopies >= 1);
7379 : :
7380 : 44052 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7381 : :
7382 : : /* 4.2. Check support for the epilog operation.
7383 : :
7384 : : If STMT represents a reduction pattern, then the type of the
7385 : : reduction variable may be different than the type of the rest
7386 : : of the arguments. For example, consider the case of accumulation
7387 : : of shorts into an int accumulator; The original code:
7388 : : S1: int_a = (int) short_a;
7389 : : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7390 : :
7391 : : was replaced with:
7392 : : STMT: int_acc = widen_sum <short_a, int_acc>
7393 : :
7394 : : This means that:
7395 : : 1. The tree-code that is used to create the vector operation in the
7396 : : epilog code (that reduces the partial results) is not the
7397 : : tree-code of STMT, but is rather the tree-code of the original
7398 : : stmt from the pattern that STMT is replacing. I.e, in the example
7399 : : above we want to use 'widen_sum' in the loop, but 'plus' in the
7400 : : epilog.
7401 : : 2. The type (mode) we use to check available target support
7402 : : for the vector operation to be created in the *epilog*, is
7403 : : determined by the type of the reduction variable (in the example
7404 : : above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7405 : : However the type (mode) we use to check available target support
7406 : : for the vector operation to be created *inside the loop*, is
7407 : : determined by the type of the other arguments to STMT (in the
7408 : : example we'd check this: optab_handler (widen_sum_optab,
7409 : : vect_short_mode)).
7410 : :
7411 : : This is contrary to "regular" reductions, in which the types of all
7412 : : the arguments are the same as the type of the reduction variable.
7413 : : For "regular" reductions we can therefore use the same vector type
7414 : : (and also the same tree-code) when generating the epilog code and
7415 : : when generating the code inside the loop. */
7416 : :
7417 : 44052 : code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
7418 : :
7419 : : /* If conversion might have created a conditional operation like
7420 : : IFN_COND_ADD already. Use the internal code for the following checks. */
7421 : 44052 : if (orig_code.is_internal_fn ())
7422 : : {
7423 : 3353 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7424 : 3353 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7425 : : }
7426 : :
7427 : 44052 : VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
7428 : :
7429 : 44052 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7430 : 44052 : if (reduction_type == TREE_CODE_REDUCTION)
7431 : : {
7432 : : /* Check whether it's ok to change the order of the computation.
7433 : : Generally, when vectorizing a reduction we change the order of the
7434 : : computation. This may change the behavior of the program in some
7435 : : cases, so we need to check that this is ok. One exception is when
7436 : : vectorizing an outer-loop: the inner-loop is executed sequentially,
7437 : : and therefore vectorizing reductions in the inner-loop during
7438 : : outer-loop vectorization is safe. Likewise when we are vectorizing
7439 : : a series of reductions using SLP and the VF is one the reductions
7440 : : are performed in scalar order. */
7441 : 43383 : if (!reduc_chain
7442 : 43383 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7443 : : ;
7444 : 43269 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7445 : : {
7446 : : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7447 : : is not directy used in stmt. */
7448 : 5002 : if (!only_slp_reduc_chain
7449 : 5002 : && reduc_chain_length != 1)
7450 : : {
7451 : 53 : if (dump_enabled_p ())
7452 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7453 : : "in-order reduction chain without SLP.\n");
7454 : 53 : return false;
7455 : : }
7456 : : /* Code generation doesn't support function calls other
7457 : : than .COND_*. */
7458 : 4949 : if (!op.code.is_tree_code ()
7459 : 5051 : && !(op.code.is_internal_fn ()
7460 : 51 : && conditional_internal_fn_code (internal_fn (op.code))
7461 : : != ERROR_MARK))
7462 : : {
7463 : 10 : if (dump_enabled_p ())
7464 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7465 : : "in-order reduction chain operation not "
7466 : : "supported.\n");
7467 : 10 : return false;
7468 : : }
7469 : 4939 : VECT_REDUC_INFO_TYPE (reduc_info)
7470 : 4939 : = reduction_type = FOLD_LEFT_REDUCTION;
7471 : : }
7472 : 38267 : else if (!commutative_binary_op_p (orig_code, op.type)
7473 : 38267 : || !associative_binary_op_p (orig_code, op.type))
7474 : : {
7475 : 180 : if (dump_enabled_p ())
7476 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7477 : : "reduction: not commutative/associative\n");
7478 : 180 : return false;
7479 : : }
7480 : : }
7481 : :
7482 : 4939 : if ((reduction_type == COND_REDUCTION
7483 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7484 : : || reduction_type == CONST_COND_REDUCTION
7485 : 38870 : || reduction_type == EXTRACT_LAST_REDUCTION)
7486 : : && 1
7487 : 669 : && ncopies > 1)
7488 : : {
7489 : 272 : if (dump_enabled_p ())
7490 : 64 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7491 : : "multiple types in condition reduction.\n");
7492 : 272 : return false;
7493 : : }
7494 : :
7495 : 43537 : internal_fn reduc_fn = IFN_LAST;
7496 : 43537 : if (reduction_type == TREE_CODE_REDUCTION
7497 : 43537 : || reduction_type == FOLD_LEFT_REDUCTION
7498 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7499 : 397 : || reduction_type == CONST_COND_REDUCTION)
7500 : : {
7501 : 38315 : if (reduction_type == FOLD_LEFT_REDUCTION
7502 : 47403 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7503 : 38315 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7504 : : {
7505 : 42578 : if (reduc_fn != IFN_LAST
7506 : 42578 : && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7507 : : OPTIMIZE_FOR_SPEED))
7508 : : {
7509 : 9286 : if (dump_enabled_p ())
7510 : 777 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7511 : : "reduc op not supported by target.\n");
7512 : :
7513 : 9286 : reduc_fn = IFN_LAST;
7514 : : }
7515 : : }
7516 : : else
7517 : : {
7518 : 676 : if (dump_enabled_p ())
7519 : 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7520 : : "no reduc code for scalar code.\n");
7521 : :
7522 : 676 : return false;
7523 : : }
7524 : : }
7525 : 283 : else if (reduction_type == COND_REDUCTION)
7526 : : {
7527 : 283 : int scalar_precision
7528 : 283 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7529 : 283 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
7530 : 283 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7531 : : vectype_out);
7532 : :
7533 : 283 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7534 : : OPTIMIZE_FOR_SPEED))
7535 : 7 : reduc_fn = IFN_REDUC_MAX;
7536 : : }
7537 : 42861 : VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
7538 : :
7539 : 42861 : if (reduction_type != EXTRACT_LAST_REDUCTION
7540 : : && reduc_fn == IFN_LAST
7541 : : && !nunits_out.is_constant ())
7542 : : {
7543 : : if (dump_enabled_p ())
7544 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7545 : : "missing target support for reduction on"
7546 : : " variable-length vectors.\n");
7547 : : return false;
7548 : : }
7549 : :
7550 : : /* For SLP reductions, see if there is a neutral value we can use. */
7551 : 42861 : tree neutral_op = NULL_TREE;
7552 : 42861 : tree initial_value = NULL_TREE;
7553 : 42861 : if (reduc_chain)
7554 : 2742 : initial_value = vect_phi_initial_value (reduc_def_phi);
7555 : 42861 : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7556 : : orig_code, initial_value);
7557 : :
7558 : 42861 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7559 : : {
7560 : : /* We can't support in-order reductions of code such as this:
7561 : :
7562 : : for (int i = 0; i < n1; ++i)
7563 : : for (int j = 0; j < n2; ++j)
7564 : : l += a[j];
7565 : :
7566 : : since GCC effectively transforms the loop when vectorizing:
7567 : :
7568 : : for (int i = 0; i < n1 / VF; ++i)
7569 : : for (int j = 0; j < n2; ++j)
7570 : : for (int k = 0; k < VF; ++k)
7571 : : l += a[j];
7572 : :
7573 : : which is a reassociation of the original operation. */
7574 : 56 : if (dump_enabled_p ())
7575 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7576 : : "in-order double reduction not supported.\n");
7577 : :
7578 : 56 : return false;
7579 : : }
7580 : :
7581 : 42805 : if (reduction_type == FOLD_LEFT_REDUCTION
7582 : 4207 : && SLP_TREE_LANES (slp_node) > 1
7583 : 102 : && !reduc_chain)
7584 : : {
7585 : : /* We cannot use in-order reductions in this case because there is
7586 : : an implicit reassociation of the operations involved. */
7587 : 42 : if (dump_enabled_p ())
7588 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7589 : : "in-order unchained SLP reductions not supported.\n");
7590 : 42 : return false;
7591 : : }
7592 : :
7593 : : /* For double reductions, and for SLP reductions with a neutral value,
7594 : : we construct a variable-length initial vector by loading a vector
7595 : : full of the neutral value and then shift-and-inserting the start
7596 : : values into the low-numbered elements. */
7597 : 42763 : if ((double_reduc || neutral_op)
7598 : : && !nunits_out.is_constant ()
7599 : : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7600 : : vectype_out, OPTIMIZE_FOR_SPEED))
7601 : : {
7602 : : if (dump_enabled_p ())
7603 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7604 : : "reduction on variable-length vectors requires"
7605 : : " target support for a vector-shift-and-insert"
7606 : : " operation.\n");
7607 : : return false;
7608 : : }
7609 : :
7610 : : /* Check extra constraints for variable-length unchained SLP reductions. */
7611 : 42763 : if (!reduc_chain
7612 : : && !nunits_out.is_constant ())
7613 : : {
7614 : : /* We checked above that we could build the initial vector when
7615 : : there's a neutral element value. Check here for the case in
7616 : : which each SLP statement has its own initial value and in which
7617 : : that value needs to be repeated for every instance of the
7618 : : statement within the initial vector. */
7619 : : unsigned int group_size = SLP_TREE_LANES (slp_node);
7620 : : if (!neutral_op
7621 : : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7622 : : TREE_TYPE (vectype_out)))
7623 : : {
7624 : : if (dump_enabled_p ())
7625 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7626 : : "unsupported form of SLP reduction for"
7627 : : " variable-length vectors: cannot build"
7628 : : " initial vector.\n");
7629 : : return false;
7630 : : }
7631 : : /* The epilogue code relies on the number of elements being a multiple
7632 : : of the group size. The duplicate-and-interleave approach to setting
7633 : : up the initial vector does too. */
7634 : : if (!multiple_p (nunits_out, group_size))
7635 : : {
7636 : : if (dump_enabled_p ())
7637 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7638 : : "unsupported form of SLP reduction for"
7639 : : " variable-length vectors: the vector size"
7640 : : " is not a multiple of the number of results.\n");
7641 : : return false;
7642 : : }
7643 : : }
7644 : :
7645 : 42763 : if (reduction_type == COND_REDUCTION)
7646 : : {
7647 : 283 : widest_int ni;
7648 : :
7649 : 283 : if (! max_loop_iterations (loop, &ni))
7650 : : {
7651 : 0 : if (dump_enabled_p ())
7652 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
7653 : : "loop count not known, cannot create cond "
7654 : : "reduction.\n");
7655 : 0 : return false;
7656 : : }
7657 : : /* Convert backedges to iterations. */
7658 : 283 : ni += 1;
7659 : :
7660 : : /* The additional index will be the same type as the condition. Check
7661 : : that the loop can fit into this less one (because we'll use up the
7662 : : zero slot for when there are no matches). */
7663 : 283 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7664 : 283 : if (wi::geu_p (ni, wi::to_widest (max_index)))
7665 : : {
7666 : 90 : if (dump_enabled_p ())
7667 : 54 : dump_printf_loc (MSG_NOTE, vect_location,
7668 : : "loop size is greater than data size.\n");
7669 : 90 : return false;
7670 : : }
7671 : 283 : }
7672 : :
7673 : : /* In case the vectorization factor (VF) is bigger than the number
7674 : : of elements that we can fit in a vectype (nunits), we have to generate
7675 : : more than one vector stmt - i.e - we need to "unroll" the
7676 : : vector stmt by a factor VF/nunits. For more details see documentation
7677 : : in vectorizable_operation. */
7678 : :
7679 : : /* If the reduction is used in an outer loop we need to generate
7680 : : VF intermediate results, like so (e.g. for ncopies=2):
7681 : : r0 = phi (init, r0)
7682 : : r1 = phi (init, r1)
7683 : : r0 = x0 + r0;
7684 : : r1 = x1 + r1;
7685 : : (i.e. we generate VF results in 2 registers).
7686 : : In this case we have a separate def-use cycle for each copy, and therefore
7687 : : for each copy we get the vector def for the reduction variable from the
7688 : : respective phi node created for this copy.
7689 : :
7690 : : Otherwise (the reduction is unused in the loop nest), we can combine
7691 : : together intermediate results, like so (e.g. for ncopies=2):
7692 : : r = phi (init, r)
7693 : : r = x0 + r;
7694 : : r = x1 + r;
7695 : : (i.e. we generate VF/2 results in a single register).
7696 : : In this case for each copy we get the vector def for the reduction variable
7697 : : from the vectorized reduction operation generated in the previous iteration.
7698 : :
7699 : : This only works when we see both the reduction PHI and its only consumer
7700 : : in vectorizable_reduction and there are no intermediate stmts
7701 : : participating. When unrolling we want each unrolled iteration to have its
7702 : : own reduction accumulator since one of the main goals of unrolling a
7703 : : reduction is to reduce the aggregate loop-carried latency. */
7704 : 42673 : if (ncopies > 1
7705 : 42673 : && !reduc_chain
7706 : 5008 : && SLP_TREE_LANES (slp_node) == 1
7707 : 4931 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7708 : 4916 : && reduc_chain_length == 1
7709 : 4726 : && loop_vinfo->suggested_unroll_factor == 1)
7710 : 42673 : single_defuse_cycle = true;
7711 : :
7712 : 42673 : if (single_defuse_cycle && !lane_reducing)
7713 : : {
7714 : 4172 : gcc_assert (op.code != COND_EXPR);
7715 : :
7716 : : /* 4. check support for the operation in the loop
7717 : :
7718 : : This isn't necessary for the lane reduction codes, since they
7719 : : can only be produced by pattern matching, and it's up to the
7720 : : pattern matcher to test for support. The main reason for
7721 : : specifically skipping this step is to avoid rechecking whether
7722 : : mixed-sign dot-products can be implemented using signed
7723 : : dot-products. */
7724 : 4172 : machine_mode vec_mode = TYPE_MODE (vectype_in);
7725 : 4172 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
7726 : : {
7727 : 841 : if (dump_enabled_p ())
7728 : 10 : dump_printf (MSG_NOTE, "op not supported by target.\n");
7729 : 1682 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7730 : 841 : || !vect_can_vectorize_without_simd_p (op.code))
7731 : : single_defuse_cycle = false;
7732 : : else
7733 : 5 : if (dump_enabled_p ())
7734 : 0 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7735 : : }
7736 : :
7737 : 4172 : if (vect_emulated_vector_p (vectype_in)
7738 : 4172 : && !vect_can_vectorize_without_simd_p (op.code))
7739 : : {
7740 : 0 : if (dump_enabled_p ())
7741 : 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
7742 : 0 : return false;
7743 : : }
7744 : : }
7745 : 42673 : if (dump_enabled_p () && single_defuse_cycle)
7746 : 639 : dump_printf_loc (MSG_NOTE, vect_location,
7747 : : "using single def-use cycle for reduction by reducing "
7748 : : "multiple vectors to one in the loop body\n");
7749 : 42673 : VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7750 : :
7751 : : /* For lane-reducing operation, the below processing related to single
7752 : : defuse-cycle will be done in its own vectorizable function. One more
7753 : : thing to note is that the operation must not be involved in fold-left
7754 : : reduction. */
7755 : 42673 : single_defuse_cycle &= !lane_reducing;
7756 : :
7757 : 42673 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
7758 : 23643 : for (i = 0; i < (int) op.num_ops; i++)
7759 : 16194 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7760 : : {
7761 : 0 : if (dump_enabled_p ())
7762 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7763 : : "incompatible vector types for invariants\n");
7764 : 0 : return false;
7765 : : }
7766 : :
7767 : 42673 : vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
7768 : : reduction_type, ncopies, cost_vec);
7769 : : /* Cost the reduction op inside the loop if transformed via
7770 : : vect_transform_reduction for non-lane-reducing operation. Otherwise
7771 : : this is costed by the separate vectorizable_* routines. */
7772 : 42673 : if (single_defuse_cycle)
7773 : 3336 : record_stmt_cost (cost_vec, ncopies, vector_stmt,
7774 : : slp_for_stmt_info, 0, vect_body);
7775 : :
7776 : 42673 : if (dump_enabled_p ()
7777 : 42673 : && reduction_type == FOLD_LEFT_REDUCTION)
7778 : 200 : dump_printf_loc (MSG_NOTE, vect_location,
7779 : : "using an in-order (fold-left) reduction.\n");
7780 : 42673 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7781 : :
7782 : : /* All but single defuse-cycle optimized and fold-left reductions go
7783 : : through their own vectorizable_* routines. */
7784 : 42673 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
7785 : : {
7786 : 35224 : stmt_vec_info tem
7787 : 35224 : = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7788 : 35224 : if (REDUC_GROUP_FIRST_ELEMENT (tem))
7789 : : {
7790 : 2682 : gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7791 : : tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7792 : : }
7793 : 35224 : STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7794 : 35224 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7795 : : }
7796 : 7449 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7797 : 4 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7798 : : slp_node, op.code, op.type,
7799 : : vectype_in);
7800 : : return true;
7801 : : }
7802 : :
7803 : : /* STMT_INFO is a dot-product reduction whose multiplication operands
7804 : : have different signs. Emit a sequence to emulate the operation
7805 : : using a series of signed DOT_PROD_EXPRs and return the last
7806 : : statement generated. VEC_DEST is the result of the vector operation
7807 : : and VOP lists its inputs. */
7808 : :
7809 : : static gassign *
7810 : 2 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7811 : : gimple_stmt_iterator *gsi, tree vec_dest,
7812 : : tree vop[3])
7813 : : {
7814 : 2 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7815 : 2 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7816 : 2 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
7817 : 2 : gimple *new_stmt;
7818 : :
7819 : : /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
7820 : 2 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7821 : 0 : std::swap (vop[0], vop[1]);
7822 : :
7823 : : /* Convert all inputs to signed types. */
7824 : 8 : for (int i = 0; i < 3; ++i)
7825 : 6 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7826 : : {
7827 : 2 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7828 : 2 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7829 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7830 : 2 : vop[i] = tmp;
7831 : : }
7832 : :
7833 : : /* In the comments below we assume 8-bit inputs for simplicity,
7834 : : but the approach works for any full integer type. */
7835 : :
7836 : : /* Create a vector of -128. */
7837 : 2 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7838 : 2 : tree min_narrow = build_vector_from_val (narrow_vectype,
7839 : : min_narrow_elttype);
7840 : :
7841 : : /* Create a vector of 64. */
7842 : 2 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7843 : 2 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7844 : 2 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7845 : :
7846 : : /* Emit: SUB_RES = VOP[0] - 128. */
7847 : 2 : tree sub_res = make_ssa_name (narrow_vectype);
7848 : 2 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7849 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7850 : :
7851 : : /* Emit:
7852 : :
7853 : : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7854 : : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7855 : : STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7856 : :
7857 : : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7858 : : Doing the two 64 * y steps first allows more time to compute x. */
7859 : 2 : tree stage1 = make_ssa_name (wide_vectype);
7860 : 2 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7861 : : vop[1], half_narrow, vop[2]);
7862 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7863 : :
7864 : 2 : tree stage2 = make_ssa_name (wide_vectype);
7865 : 2 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7866 : : vop[1], half_narrow, stage1);
7867 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7868 : :
7869 : 2 : tree stage3 = make_ssa_name (wide_vectype);
7870 : 2 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7871 : : sub_res, vop[1], stage2);
7872 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7873 : :
7874 : : /* Convert STAGE3 to the reduction type. */
7875 : 2 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7876 : 2 : }
7877 : :
7878 : : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7879 : : value. */
7880 : :
7881 : : bool
7882 : 2325 : vect_transform_reduction (loop_vec_info loop_vinfo,
7883 : : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7884 : : slp_tree slp_node)
7885 : : {
7886 : 2325 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
7887 : 2325 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7888 : 2325 : unsigned vec_num;
7889 : :
7890 : 2325 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
7891 : :
7892 : 2325 : if (nested_in_vect_loop_p (loop, stmt_info))
7893 : : {
7894 : 0 : loop = loop->inner;
7895 : 0 : gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
7896 : : == vect_double_reduction_def);
7897 : : }
7898 : :
7899 : 2325 : gimple_match_op op;
7900 : 2325 : if (!gimple_extract_op (stmt_info->stmt, &op))
7901 : 0 : gcc_unreachable ();
7902 : :
7903 : : /* All uses but the last are expected to be defined in the loop.
7904 : : The last use is the reduction variable. In case of nested cycle this
7905 : : assumption is not true: we use reduc_index to record the index of the
7906 : : reduction variable. */
7907 : 2325 : int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
7908 : 2325 : tree vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);
7909 : :
7910 : 2325 : vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
7911 : :
7912 : 2325 : code_helper code = canonicalize_code (op.code, op.type);
7913 : 2325 : internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
7914 : :
7915 : 2325 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7916 : 2325 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7917 : 2325 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7918 : :
7919 : : /* Transform. */
7920 : 2325 : tree new_temp = NULL_TREE;
7921 : 16275 : auto_vec<tree> vec_oprnds[3];
7922 : :
7923 : 2325 : if (dump_enabled_p ())
7924 : 699 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7925 : :
7926 : : /* A binary COND_OP reduction must have the same definition and else
7927 : : value. */
7928 : 2602 : bool cond_fn_p = code.is_internal_fn ()
7929 : 277 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
7930 : 277 : if (cond_fn_p)
7931 : : {
7932 : 277 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
7933 : : || code == IFN_COND_MUL || code == IFN_COND_AND
7934 : : || code == IFN_COND_IOR || code == IFN_COND_XOR
7935 : : || code == IFN_COND_MIN || code == IFN_COND_MAX);
7936 : 277 : gcc_assert (op.num_ops == 4
7937 : : && (op.ops[reduc_index]
7938 : : == op.ops[internal_fn_else_index ((internal_fn) code)]));
7939 : : }
7940 : :
7941 : 2325 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7942 : :
7943 : 2325 : vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7944 : 2325 : if (reduction_type == FOLD_LEFT_REDUCTION)
7945 : : {
7946 : 838 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
7947 : 838 : gcc_assert (code.is_tree_code () || cond_fn_p);
7948 : 838 : return vectorize_fold_left_reduction
7949 : 838 : (loop_vinfo, stmt_info, gsi, slp_node,
7950 : 838 : code, reduc_fn, op.num_ops, vectype_in,
7951 : 838 : reduc_index, masks, lens);
7952 : : }
7953 : :
7954 : 1487 : bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
7955 : 1487 : bool lane_reducing = lane_reducing_op_p (code);
7956 : 1244 : gcc_assert (single_defuse_cycle || lane_reducing);
7957 : :
7958 : 1487 : if (lane_reducing)
7959 : : {
7960 : : /* The last operand of lane-reducing op is for reduction. */
7961 : 243 : gcc_assert (reduc_index == (int) op.num_ops - 1);
7962 : : }
7963 : :
7964 : : /* Create the destination vector */
7965 : 1487 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7966 : 1487 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7967 : :
7968 : : /* Get NCOPIES vector definitions for all operands except the reduction
7969 : : definition. */
7970 : 1487 : if (!cond_fn_p)
7971 : : {
7972 : 1219 : gcc_assert (reduc_index >= 0 && reduc_index <= 2);
7973 : 2038 : vect_get_vec_defs (loop_vinfo, slp_node,
7974 : 1219 : single_defuse_cycle && reduc_index == 0
7975 : : ? NULL_TREE : op.ops[0], &vec_oprnds[0],
7976 : 1219 : single_defuse_cycle && reduc_index == 1
7977 : : ? NULL_TREE : op.ops[1], &vec_oprnds[1],
7978 : 1219 : op.num_ops == 3
7979 : 243 : && !(single_defuse_cycle && reduc_index == 2)
7980 : : ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
7981 : : }
7982 : : else
7983 : : {
7984 : : /* For a conditional operation pass the truth type as mask
7985 : : vectype. */
7986 : 268 : gcc_assert (single_defuse_cycle
7987 : : && (reduc_index == 1 || reduc_index == 2));
7988 : 268 : vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
7989 : : &vec_oprnds[0],
7990 : : reduc_index == 1 ? NULL_TREE : op.ops[1],
7991 : : &vec_oprnds[1],
7992 : : reduc_index == 2 ? NULL_TREE : op.ops[2],
7993 : : &vec_oprnds[2]);
7994 : : }
7995 : :
7996 : : /* For single def-use cycles get one copy of the vectorized reduction
7997 : : definition. */
7998 : 1487 : if (single_defuse_cycle)
7999 : : {
8000 : 1408 : vect_get_vec_defs (loop_vinfo, slp_node,
8001 : : reduc_index == 0 ? op.ops[0] : NULL_TREE,
8002 : : &vec_oprnds[0],
8003 : : reduc_index == 1 ? op.ops[1] : NULL_TREE,
8004 : : &vec_oprnds[1],
8005 : : reduc_index == 2 ? op.ops[2] : NULL_TREE,
8006 : : &vec_oprnds[2]);
8007 : : }
8008 : 79 : else if (lane_reducing)
8009 : : {
8010 : : /* For normal reduction, consistency between vectorized def/use is
8011 : : naturally ensured when mapping from scalar statement. But if lane-
8012 : : reducing op is involved in reduction, thing would become somewhat
8013 : : complicated in that the op's result and operand for accumulation are
8014 : : limited to less lanes than other operands, which certainly causes
8015 : : def/use mismatch on adjacent statements around the op if do not have
8016 : : any kind of specific adjustment. One approach is to refit lane-
8017 : : reducing op in the way of introducing new trivial pass-through copies
8018 : : to fix possible def/use gap, so as to make it behave like a normal op.
8019 : : And vector reduction PHIs are always generated to the full extent, no
8020 : : matter lane-reducing op exists or not. If some copies or PHIs are
8021 : : actually superfluous, they would be cleaned up by passes after
8022 : : vectorization. An example for single-lane slp, lane-reducing ops
8023 : : with mixed input vectypes in a reduction chain, is given as below.
8024 : : Similarly, this handling is applicable for multiple-lane slp as well.
8025 : :
8026 : : int sum = 1;
8027 : : for (i)
8028 : : {
8029 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
8030 : : sum += w[i]; // widen-sum <vector(16) char>
8031 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
8032 : : sum += n[i]; // normal <vector(4) int>
8033 : : }
8034 : :
8035 : : The vector size is 128-bit,vectorization factor is 16. Reduction
8036 : : statements would be transformed as:
8037 : :
8038 : : vector<4> int sum_v0 = { 0, 0, 0, 1 };
8039 : : vector<4> int sum_v1 = { 0, 0, 0, 0 };
8040 : : vector<4> int sum_v2 = { 0, 0, 0, 0 };
8041 : : vector<4> int sum_v3 = { 0, 0, 0, 0 };
8042 : :
8043 : : for (i / 16)
8044 : : {
8045 : : sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8046 : : sum_v1 = sum_v1; // copy
8047 : : sum_v2 = sum_v2; // copy
8048 : : sum_v3 = sum_v3; // copy
8049 : :
8050 : : sum_v0 = sum_v0; // copy
8051 : : sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8052 : : sum_v2 = sum_v2; // copy
8053 : : sum_v3 = sum_v3; // copy
8054 : :
8055 : : sum_v0 = sum_v0; // copy
8056 : : sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8057 : : sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8058 : : sum_v3 = sum_v3; // copy
8059 : :
8060 : : sum_v0 += n_v0[i: 0 ~ 3 ];
8061 : : sum_v1 += n_v1[i: 4 ~ 7 ];
8062 : : sum_v2 += n_v2[i: 8 ~ 11];
8063 : : sum_v3 += n_v3[i: 12 ~ 15];
8064 : : }
8065 : :
8066 : : Moreover, for a higher instruction parallelism in final vectorized
8067 : : loop, it is considered to make those effective vector lane-reducing
8068 : : ops be distributed evenly among all def-use cycles. In the above
8069 : : example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
8070 : : cycles, instruction dependency among them could be eliminated. */
8071 : 79 : unsigned effec_ncopies = vec_oprnds[0].length ();
8072 : 79 : unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8073 : :
8074 : 79 : gcc_assert (effec_ncopies <= total_ncopies);
8075 : :
8076 : 79 : if (effec_ncopies < total_ncopies)
8077 : : {
8078 : 237 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8079 : : {
8080 : 316 : gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8081 : 158 : vec_oprnds[i].safe_grow_cleared (total_ncopies);
8082 : : }
8083 : : }
8084 : :
8085 : 79 : tree reduc_vectype_in = vectype_in;
8086 : 79 : gcc_assert (reduc_vectype_in);
8087 : :
8088 : 79 : unsigned effec_reduc_ncopies
8089 : 79 : = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
8090 : :
8091 : 79 : gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8092 : :
8093 : 79 : if (effec_ncopies < effec_reduc_ncopies)
8094 : : {
8095 : : /* Find suitable def-use cycles to generate vectorized statements
8096 : : into, and reorder operands based on the selection. */
8097 : 0 : unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
8098 : 0 : unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8099 : :
8100 : 0 : gcc_assert (curr_pos < effec_reduc_ncopies);
8101 : 0 : VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;
8102 : :
8103 : 0 : if (curr_pos)
8104 : : {
8105 : 0 : unsigned count = effec_reduc_ncopies - effec_ncopies;
8106 : 0 : unsigned start = curr_pos - count;
8107 : :
8108 : 0 : if ((int) start < 0)
8109 : : {
8110 : 0 : count = curr_pos;
8111 : 0 : start = 0;
8112 : : }
8113 : :
8114 : 0 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8115 : : {
8116 : 0 : for (unsigned j = effec_ncopies; j > start; j--)
8117 : : {
8118 : 0 : unsigned k = j - 1;
8119 : 0 : std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8120 : 0 : gcc_assert (!vec_oprnds[i][k]);
8121 : : }
8122 : : }
8123 : : }
8124 : : }
8125 : : }
8126 : :
8127 : 1487 : bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
8128 : 2477 : unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8129 : 1487 : unsigned mask_index = 0;
8130 : :
8131 : 6448 : for (unsigned i = 0; i < num; ++i)
8132 : : {
8133 : 4961 : gimple *new_stmt;
8134 : 4961 : tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8135 : 4961 : if (!vop[0] || !vop[1])
8136 : : {
8137 : 418 : tree reduc_vop = vec_oprnds[reduc_index][i];
8138 : :
8139 : : /* If could not generate an effective vector statement for current
8140 : : portion of reduction operand, insert a trivial copy to simply
8141 : : handle over the operand to other dependent statements. */
8142 : 418 : gcc_assert (reduc_vop);
8143 : :
8144 : 418 : if (TREE_CODE (reduc_vop) == SSA_NAME
8145 : 418 : && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8146 : 418 : new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8147 : : else
8148 : : {
8149 : 0 : new_temp = make_ssa_name (vec_dest);
8150 : 0 : new_stmt = gimple_build_assign (new_temp, reduc_vop);
8151 : 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8152 : : gsi);
8153 : : }
8154 : : }
8155 : 4543 : else if (masked_loop_p && !mask_by_cond_expr)
8156 : : {
8157 : : /* No conditional ifns have been defined for lane-reducing op
8158 : : yet. */
8159 : 8 : gcc_assert (!lane_reducing);
8160 : :
8161 : : /* Make sure that the reduction accumulator is vop[0]. */
8162 : 8 : if (reduc_index == 1)
8163 : : {
8164 : 8 : gcc_assert (commutative_binary_op_p (code, op.type));
8165 : 8 : std::swap (vop[0], vop[1]);
8166 : : }
8167 : 8 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8168 : : vec_num, vectype_in,
8169 : : mask_index++);
8170 : 8 : gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8171 : : vop[0], vop[1], vop[0]);
8172 : 8 : new_temp = make_ssa_name (vec_dest, call);
8173 : 8 : gimple_call_set_lhs (call, new_temp);
8174 : 8 : gimple_call_set_nothrow (call, true);
8175 : 8 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8176 : 8 : new_stmt = call;
8177 : : }
8178 : : else
8179 : : {
8180 : 4535 : if (op.num_ops >= 3)
8181 : 1097 : vop[2] = vec_oprnds[2][i];
8182 : :
8183 : 4535 : if (masked_loop_p && mask_by_cond_expr)
8184 : : {
8185 : 4 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8186 : : vec_num, vectype_in,
8187 : : mask_index++);
8188 : 4 : build_vect_cond_expr (code, vop, mask, gsi);
8189 : : }
8190 : :
8191 : 4535 : if (emulated_mixed_dot_prod)
8192 : 2 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8193 : : vec_dest, vop);
8194 : :
8195 : 5247 : else if (code.is_internal_fn () && !cond_fn_p)
8196 : 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8197 : : op.num_ops,
8198 : : vop[0], vop[1], vop[2]);
8199 : 5247 : else if (code.is_internal_fn () && cond_fn_p)
8200 : 714 : new_stmt = gimple_build_call_internal (internal_fn (code),
8201 : : op.num_ops,
8202 : : vop[0], vop[1], vop[2],
8203 : : vop[reduc_index]);
8204 : : else
8205 : 3819 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8206 : : vop[0], vop[1], vop[2]);
8207 : 4535 : new_temp = make_ssa_name (vec_dest, new_stmt);
8208 : 4535 : gimple_set_lhs (new_stmt, new_temp);
8209 : 4535 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8210 : : }
8211 : :
8212 : 4961 : if (single_defuse_cycle && i < num - 1)
8213 : 2929 : vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8214 : : else
8215 : 2032 : slp_node->push_vec_def (new_stmt);
8216 : : }
8217 : :
8218 : : return true;
8219 : 9300 : }
8220 : :
8221 : : /* Transform phase of a cycle PHI. */
8222 : :
8223 : : bool
8224 : 22612 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8225 : : stmt_vec_info stmt_info,
8226 : : slp_tree slp_node, slp_instance slp_node_instance)
8227 : : {
8228 : 22612 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
8229 : 22612 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8230 : 22612 : int i;
8231 : 22612 : bool nested_cycle = false;
8232 : 22612 : int vec_num;
8233 : 22612 : const bool reduc_chain
8234 : 22612 : = SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_chain;
8235 : :
8236 : 22715 : if (nested_in_vect_loop_p (loop, stmt_info))
8237 : : {
8238 : : loop = loop->inner;
8239 : : nested_cycle = true;
8240 : : }
8241 : :
8242 : 22612 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
8243 : 22612 : if (reduc_info
8244 : 22031 : && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8245 : 22031 : || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
8246 : : /* Leave the scalar phi in place. */
8247 : : return true;
8248 : :
8249 : 21774 : vec_num = vect_get_num_copies (loop_vinfo, slp_node);
8250 : :
8251 : : /* Check whether we should use a single PHI node and accumulate
8252 : : vectors to one before the backedge. */
8253 : 21774 : if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
8254 : 21774 : vec_num = 1;
8255 : :
8256 : : /* Create the destination vector */
8257 : 21774 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8258 : 21774 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8259 : : vectype_out);
8260 : :
8261 : : /* Get the loop-entry arguments. */
8262 : 21774 : tree vec_initial_def = NULL_TREE;
8263 : 21774 : auto_vec<tree> vec_initial_defs;
8264 : 21774 : vec_initial_defs.reserve (vec_num);
8265 : : /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8266 : : and we can't use zero for induc_val, use initial_def. Similarly
8267 : : for REDUC_MIN and initial_def larger than the base. */
8268 : 21774 : if (reduc_info
8269 : 21193 : && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8270 : : {
8271 : 66 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8272 : 66 : tree initial_def = vect_phi_initial_value (phi);
8273 : 66 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
8274 : 66 : tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
8275 : 66 : if (TREE_CODE (initial_def) == INTEGER_CST
8276 : 64 : && !integer_zerop (induc_val)
8277 : 130 : && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
8278 : 44 : && tree_int_cst_lt (initial_def, induc_val))
8279 : 61 : || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
8280 : 20 : && tree_int_cst_lt (induc_val, initial_def))))
8281 : : {
8282 : 3 : induc_val = initial_def;
8283 : : /* Communicate we used the initial_def to epilouge
8284 : : generation. */
8285 : 3 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8286 : : }
8287 : 66 : vec_initial_defs.quick_push
8288 : 66 : (build_vector_from_val (vectype_out, induc_val));
8289 : 66 : }
8290 : 21708 : else if (nested_cycle)
8291 : : {
8292 : 651 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8293 : 651 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8294 : : &vec_initial_defs);
8295 : : }
8296 : : else
8297 : : {
8298 : 21057 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8299 : 21057 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
8300 : 21057 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8301 : :
8302 : 21057 : unsigned int num_phis = stmts.length ();
8303 : 21057 : if (reduc_chain)
8304 : 2280 : num_phis = 1;
8305 : 21057 : initial_values.reserve (num_phis);
8306 : 42422 : for (unsigned int i = 0; i < num_phis; ++i)
8307 : : {
8308 : 21365 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8309 : 21365 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8310 : : }
8311 : 21057 : if (vec_num == 1)
8312 : 18488 : vect_find_reusable_accumulator (loop_vinfo, reduc_info, vectype_out);
8313 : 21057 : if (!initial_values.is_empty ())
8314 : : {
8315 : 20870 : tree initial_value
8316 : 41562 : = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8317 : 20870 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
8318 : 20870 : tree neutral_op
8319 : 20870 : = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8320 : : code, initial_value);
8321 : : /* Try to simplify the vector initialization by applying an
8322 : : adjustment after the reduction has been performed. This
8323 : : can also break a critical path but on the other hand
8324 : : requires to keep the initial value live across the loop. */
8325 : 20870 : if (neutral_op
8326 : 20783 : && initial_values.length () == 1
8327 : 20619 : && !VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
8328 : 17596 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8329 : 38396 : && !operand_equal_p (neutral_op, initial_values[0]))
8330 : : {
8331 : 12775 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
8332 : 12775 : = initial_values[0];
8333 : 12775 : initial_values[0] = neutral_op;
8334 : : }
8335 : 41740 : get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
8336 : : &vec_initial_defs, vec_num,
8337 : : stmts.length (), neutral_op);
8338 : : }
8339 : : }
8340 : :
8341 : 21774 : if (vec_initial_def)
8342 : : {
8343 : 0 : vec_initial_defs.create (1);
8344 : 0 : vec_initial_defs.quick_push (vec_initial_def);
8345 : : }
8346 : :
8347 : 21774 : if (reduc_info)
8348 : 21193 : if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
8349 : : {
8350 : 3224 : tree def = accumulator->reduc_input;
8351 : 3224 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8352 : : {
8353 : 3222 : unsigned int nreduc;
8354 : 6444 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8355 : 3222 : (TREE_TYPE (def)),
8356 : 3222 : TYPE_VECTOR_SUBPARTS (vectype_out),
8357 : : &nreduc);
8358 : 0 : gcc_assert (res);
8359 : 3222 : gimple_seq stmts = NULL;
8360 : : /* Reduce the single vector to a smaller one. */
8361 : 3222 : if (nreduc != 1)
8362 : : {
8363 : : /* Perform the reduction in the appropriate type. */
8364 : 3222 : tree rvectype = vectype_out;
8365 : 3222 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8366 : 3222 : TREE_TYPE (TREE_TYPE (def))))
8367 : 227 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8368 : : TYPE_VECTOR_SUBPARTS
8369 : 454 : (vectype_out));
8370 : 3222 : def = vect_create_partial_epilog (def, rvectype,
8371 : : VECT_REDUC_INFO_CODE
8372 : : (reduc_info),
8373 : : &stmts);
8374 : : }
8375 : : /* The epilogue loop might use a different vector mode, like
8376 : : VNx2DI vs. V2DI. */
8377 : 3222 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8378 : : {
8379 : 0 : tree reduc_type = build_vector_type_for_mode
8380 : 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8381 : 0 : def = gimple_convert (&stmts, reduc_type, def);
8382 : : }
8383 : : /* Adjust the input so we pick up the partially reduced value
8384 : : for the skip edge in vect_create_epilog_for_reduction. */
8385 : 3222 : accumulator->reduc_input = def;
8386 : : /* And the reduction could be carried out using a different sign. */
8387 : 3222 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8388 : 227 : def = gimple_convert (&stmts, vectype_out, def);
8389 : 3222 : edge e;
8390 : 3222 : if ((e = loop_vinfo->main_loop_edge)
8391 : 3222 : || (e = loop_vinfo->skip_this_loop_edge))
8392 : : {
8393 : : /* While we'd like to insert on the edge this will split
8394 : : blocks and disturb bookkeeping, we also will eventually
8395 : : need this on the skip edge. Rely on sinking to
8396 : : fixup optimal placement and insert in the pred. */
8397 : 3035 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
8398 : : /* Insert before a cond that eventually skips the
8399 : : epilogue. */
8400 : 3035 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8401 : 3022 : gsi_prev (&gsi);
8402 : 3035 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8403 : : }
8404 : : else
8405 : 187 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8406 : : stmts);
8407 : : }
8408 : 3224 : if (loop_vinfo->main_loop_edge)
8409 : 3037 : vec_initial_defs[0]
8410 : 3037 : = vect_get_main_loop_result (loop_vinfo, def,
8411 : 3037 : vec_initial_defs[0]);
8412 : : else
8413 : 187 : vec_initial_defs.safe_push (def);
8414 : : }
8415 : :
8416 : : /* Generate the reduction PHIs upfront. */
8417 : 47371 : for (i = 0; i < vec_num; i++)
8418 : : {
8419 : 25597 : tree vec_init_def = vec_initial_defs[i];
8420 : : /* Create the reduction-phi that defines the reduction
8421 : : operand. */
8422 : 25597 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
8423 : 25597 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8424 : : UNKNOWN_LOCATION);
8425 : :
8426 : : /* The loop-latch arg is set in epilogue processing. */
8427 : :
8428 : 25597 : slp_node->push_vec_def (new_phi);
8429 : : }
8430 : :
8431 : 21774 : return true;
8432 : 21774 : }
8433 : :
8434 : : /* Vectorizes LC PHIs. */
8435 : :
8436 : : bool
8437 : 164590 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8438 : : stmt_vec_info stmt_info,
8439 : : slp_tree slp_node)
8440 : : {
8441 : 164590 : if (!loop_vinfo
8442 : 164590 : || !is_a <gphi *> (stmt_info->stmt)
8443 : 195321 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8444 : : return false;
8445 : :
8446 : 673 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8447 : 109 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8448 : : return false;
8449 : :
8450 : : /* Deal with copies from externs or constants that disguise as
8451 : : loop-closed PHI nodes (PR97886). */
8452 : 673 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8453 : : SLP_TREE_VECTYPE (slp_node)))
8454 : : {
8455 : 0 : if (dump_enabled_p ())
8456 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8457 : : "incompatible vector types for invariants\n");
8458 : 0 : return false;
8459 : : }
8460 : :
8461 : : /* ??? This can happen with data vs. mask uses of boolean. */
8462 : 673 : if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
8463 : 673 : SLP_TREE_VECTYPE
8464 : : (SLP_TREE_CHILDREN (slp_node)[0])))
8465 : : {
8466 : 0 : if (dump_enabled_p ())
8467 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8468 : : "missed mask promotion\n");
8469 : 0 : return false;
8470 : : }
8471 : :
8472 : 673 : SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
8473 : 673 : return true;
8474 : : }
8475 : :
8476 : : bool
8477 : 428 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
8478 : : stmt_vec_info stmt_info,
8479 : : slp_tree slp_node)
8480 : : {
8481 : :
8482 : 428 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8483 : 428 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8484 : 428 : basic_block bb = gimple_bb (stmt_info->stmt);
8485 : 428 : edge e = single_pred_edge (bb);
8486 : 428 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8487 : 428 : auto_vec<tree> vec_oprnds;
8488 : 856 : vect_get_vec_defs (loop_vinfo, slp_node,
8489 : 428 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8490 : 951 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8491 : : {
8492 : : /* Create the vectorized LC PHI node. */
8493 : 523 : gphi *new_phi = create_phi_node (vec_dest, bb);
8494 : 523 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8495 : 523 : slp_node->push_vec_def (new_phi);
8496 : : }
8497 : :
8498 : 428 : return true;
8499 : 428 : }
8500 : :
8501 : : /* Vectorizes PHIs. */
8502 : :
8503 : : bool
8504 : 149939 : vectorizable_phi (bb_vec_info vinfo,
8505 : : stmt_vec_info stmt_info,
8506 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8507 : : {
8508 : 149939 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8509 : : return false;
8510 : :
8511 : 78043 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8512 : : return false;
8513 : :
8514 : 78043 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8515 : :
8516 : 78043 : if (cost_vec) /* transformation not required. */
8517 : : {
8518 : : slp_tree child;
8519 : : unsigned i;
8520 : 196912 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8521 : 134163 : if (!child)
8522 : : {
8523 : 0 : if (dump_enabled_p ())
8524 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8525 : : "PHI node with unvectorized backedge def\n");
8526 : 0 : return false;
8527 : : }
8528 : 134163 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8529 : : {
8530 : 27 : if (dump_enabled_p ())
8531 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8532 : : "incompatible vector types for invariants\n");
8533 : 27 : return false;
8534 : : }
8535 : 134136 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8536 : 134136 : && !useless_type_conversion_p (vectype,
8537 : : SLP_TREE_VECTYPE (child)))
8538 : : {
8539 : : /* With bools we can have mask and non-mask precision vectors
8540 : : or different non-mask precisions. while pattern recog is
8541 : : supposed to guarantee consistency here bugs in it can cause
8542 : : mismatches (PR103489 and PR103800 for example).
8543 : : Deal with them here instead of ICEing later. */
8544 : 18 : if (dump_enabled_p ())
8545 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8546 : : "incompatible vector type setup from "
8547 : : "bool pattern detection\n");
8548 : 18 : return false;
8549 : : }
8550 : :
8551 : : /* For single-argument PHIs assume coalescing which means zero cost
8552 : : for the scalar and the vector PHIs. This avoids artificially
8553 : : favoring the vector path (but may pessimize it in some cases). */
8554 : 62749 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8555 : 52972 : record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
8556 : : vector_stmt, slp_node, vectype, 0, vect_body);
8557 : 62749 : SLP_TREE_TYPE (slp_node) = phi_info_type;
8558 : 62749 : return true;
8559 : : }
8560 : :
8561 : 15249 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8562 : 15249 : basic_block bb = gimple_bb (stmt_info->stmt);
8563 : 15249 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8564 : 15249 : auto_vec<gphi *> new_phis;
8565 : 51456 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8566 : : {
8567 : 36207 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8568 : :
8569 : : /* Skip not yet vectorized defs. */
8570 : 36611 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8571 : 36207 : && SLP_TREE_VEC_DEFS (child).is_empty ())
8572 : 404 : continue;
8573 : :
8574 : 35803 : auto_vec<tree> vec_oprnds;
8575 : 35803 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8576 : 35803 : if (!new_phis.exists ())
8577 : : {
8578 : 15249 : new_phis.create (vec_oprnds.length ());
8579 : 32161 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8580 : : {
8581 : : /* Create the vectorized LC PHI node. */
8582 : 16912 : new_phis.quick_push (create_phi_node (vec_dest, bb));
8583 : 16912 : slp_node->push_vec_def (new_phis[j]);
8584 : : }
8585 : : }
8586 : 35803 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8587 : 76660 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8588 : 40857 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8589 : 35803 : }
8590 : : /* We should have at least one already vectorized child. */
8591 : 15249 : gcc_assert (new_phis.exists ());
8592 : :
8593 : 15249 : return true;
8594 : 15249 : }
8595 : :
8596 : : /* Vectorizes first order recurrences. An overview of the transformation
8597 : : is described below. Suppose we have the following loop.
8598 : :
8599 : : int t = 0;
8600 : : for (int i = 0; i < n; ++i)
8601 : : {
8602 : : b[i] = a[i] - t;
8603 : : t = a[i];
8604 : : }
8605 : :
8606 : : There is a first-order recurrence on 'a'. For this loop, the scalar IR
8607 : : looks (simplified) like:
8608 : :
8609 : : scalar.preheader:
8610 : : init = 0;
8611 : :
8612 : : scalar.body:
8613 : : i = PHI <0(scalar.preheader), i+1(scalar.body)>
8614 : : _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8615 : : _1 = a[i]
8616 : : b[i] = _1 - _2
8617 : : if (i < n) goto scalar.body
8618 : :
8619 : : In this example, _2 is a recurrence because it's value depends on the
8620 : : previous iteration. We vectorize this as (VF = 4)
8621 : :
8622 : : vector.preheader:
8623 : : vect_init = vect_cst(..., ..., ..., 0)
8624 : :
8625 : : vector.body
8626 : : i = PHI <0(vector.preheader), i+4(vector.body)>
8627 : : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8628 : : vect_2 = a[i, i+1, i+2, i+3];
8629 : : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8630 : : b[i, i+1, i+2, i+3] = vect_2 - vect_3
8631 : : if (..) goto vector.body
8632 : :
8633 : : In this function, vectorizable_recurr, we code generate both the
8634 : : vector PHI node and the permute since those together compute the
8635 : : vectorized value of the scalar PHI. We do not yet have the
8636 : : backedge value to fill in there nor into the vec_perm. Those
8637 : : are filled in vect_schedule_scc.
8638 : :
8639 : : TODO: Since the scalar loop does not have a use of the recurrence
8640 : : outside of the loop the natural way to implement peeling via
8641 : : vectorizing the live value doesn't work. For now peeling of loops
8642 : : with a recurrence is not implemented. For SLP the supported cases
8643 : : are restricted to those requiring a single vector recurrence PHI. */
8644 : :
8645 : : bool
8646 : 163959 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8647 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8648 : : {
8649 : 163959 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8650 : : return false;
8651 : :
8652 : 30100 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
8653 : :
8654 : : /* So far we only support first-order recurrence auto-vectorization. */
8655 : 30100 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8656 : : return false;
8657 : :
8658 : 388 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8659 : 388 : unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
8660 : 388 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8661 : 388 : unsigned dist = SLP_TREE_LANES (slp_node);
8662 : : /* We need to be able to make progress with a single vector. */
8663 : 388 : if (maybe_gt (dist * 2, nunits))
8664 : : {
8665 : 0 : if (dump_enabled_p ())
8666 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8667 : : "first order recurrence exceeds half of "
8668 : : "a vector\n");
8669 : 0 : return false;
8670 : : }
8671 : :
8672 : : /* We need to be able to build a { ..., a, b } init vector with
8673 : : dist number of distinct trailing values. Always possible
8674 : : when dist == 1 or when nunits is constant or when the initializations
8675 : : are uniform. */
8676 : 388 : tree uniform_initval = NULL_TREE;
8677 : 388 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8678 : 1576 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8679 : : {
8680 : 424 : gphi *phi = as_a <gphi *> (s->stmt);
8681 : 424 : if (! uniform_initval)
8682 : 388 : uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8683 : 36 : else if (! operand_equal_p (uniform_initval,
8684 : 36 : PHI_ARG_DEF_FROM_EDGE (phi, pe)))
8685 : : {
8686 : : uniform_initval = NULL_TREE;
8687 : : break;
8688 : : }
8689 : : }
8690 : 388 : if (!uniform_initval && !nunits.is_constant ())
8691 : : {
8692 : : if (dump_enabled_p ())
8693 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8694 : : "cannot build initialization vector for "
8695 : : "first order recurrence\n");
8696 : : return false;
8697 : : }
8698 : :
8699 : : /* First-order recurrence autovectorization needs to handle permutation
8700 : : with indices = [nunits-1, nunits, nunits+1, ...]. */
8701 : 388 : vec_perm_builder sel (nunits, 1, 3);
8702 : 1552 : for (int i = 0; i < 3; ++i)
8703 : 1164 : sel.quick_push (nunits - dist + i);
8704 : 388 : vec_perm_indices indices (sel, 2, nunits);
8705 : :
8706 : 388 : if (cost_vec) /* transformation not required. */
8707 : : {
8708 : 346 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8709 : : indices))
8710 : : return false;
8711 : :
8712 : : /* We eventually need to set a vector type on invariant
8713 : : arguments. */
8714 : : unsigned j;
8715 : : slp_tree child;
8716 : 774 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8717 : 516 : if (!vect_maybe_update_slp_op_vectype (child, vectype))
8718 : : {
8719 : 0 : if (dump_enabled_p ())
8720 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8721 : : "incompatible vector types for "
8722 : : "invariants\n");
8723 : 0 : return false;
8724 : : }
8725 : :
8726 : : /* Verify we have set up compatible types. */
8727 : 258 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8728 : 258 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
8729 : 258 : tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
8730 : 258 : if (!types_compatible_p (latch_vectype, vectype))
8731 : : return false;
8732 : :
8733 : : /* The recurrence costs the initialization vector and one permute
8734 : : for each copy. With SLP the prologue value is explicitly
8735 : : represented and costed separately. */
8736 : 226 : unsigned prologue_cost = 0;
8737 : 226 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8738 : : slp_node, 0, vect_body);
8739 : 226 : if (dump_enabled_p ())
8740 : 50 : dump_printf_loc (MSG_NOTE, vect_location,
8741 : : "vectorizable_recurr: inside_cost = %d, "
8742 : : "prologue_cost = %d .\n", inside_cost,
8743 : : prologue_cost);
8744 : :
8745 : 226 : SLP_TREE_TYPE (slp_node) = recurr_info_type;
8746 : 226 : return true;
8747 : : }
8748 : :
8749 : 42 : tree vec_init;
8750 : 42 : if (! uniform_initval)
8751 : : {
8752 : 6 : vec<constructor_elt, va_gc> *v = NULL;
8753 : 6 : vec_alloc (v, nunits.to_constant ());
8754 : 33 : for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
8755 : 27 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8756 : : build_zero_cst (TREE_TYPE (vectype)));
8757 : 39 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8758 : : {
8759 : 21 : gphi *phi = as_a <gphi *> (s->stmt);
8760 : 21 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8761 : 21 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
8762 : 21 : TREE_TYPE (preheader)))
8763 : : {
8764 : 0 : gimple_seq stmts = NULL;
8765 : 0 : preheader = gimple_convert (&stmts,
8766 : 0 : TREE_TYPE (vectype), preheader);
8767 : 0 : gsi_insert_seq_on_edge_immediate (pe, stmts);
8768 : : }
8769 : 21 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
8770 : : }
8771 : 6 : vec_init = build_constructor (vectype, v);
8772 : : }
8773 : : else
8774 : : vec_init = uniform_initval;
8775 : 42 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8776 : :
8777 : : /* Create the vectorized first-order PHI node. */
8778 : 42 : tree vec_dest = vect_get_new_vect_var (vectype,
8779 : : vect_simple_var, "vec_recur_");
8780 : 42 : basic_block bb = gimple_bb (phi);
8781 : 42 : gphi *new_phi = create_phi_node (vec_dest, bb);
8782 : 42 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8783 : :
8784 : : /* Insert shuffles the first-order recurrence autovectorization.
8785 : : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8786 : 42 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
8787 : :
8788 : : /* Insert the required permute after the latch definition. The
8789 : : second and later operands are tentative and will be updated when we have
8790 : : vectorized the latch definition. */
8791 : 42 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8792 : 42 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8793 : 42 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8794 : 42 : gsi_next (&gsi2);
8795 : :
8796 : 121 : for (unsigned i = 0; i < ncopies; ++i)
8797 : : {
8798 : 79 : vec_dest = make_ssa_name (vectype);
8799 : 79 : gassign *vperm
8800 : 121 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8801 : 42 : i == 0 ? gimple_phi_result (new_phi) : NULL,
8802 : : NULL, perm);
8803 : 79 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8804 : :
8805 : 79 : slp_node->push_vec_def (vperm);
8806 : : }
8807 : :
8808 : : return true;
8809 : 388 : }
8810 : :
8811 : : /* Return true if VECTYPE represents a vector that requires lowering
8812 : : by the vector lowering pass. */
8813 : :
8814 : : bool
8815 : 604793 : vect_emulated_vector_p (tree vectype)
8816 : : {
8817 : 1209586 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8818 : 607422 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8819 : 2611 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8820 : : }
8821 : :
8822 : : /* Return true if we can emulate CODE on an integer mode representation
8823 : : of a vector. */
8824 : :
8825 : : bool
8826 : 10660 : vect_can_vectorize_without_simd_p (tree_code code)
8827 : : {
8828 : 10660 : switch (code)
8829 : : {
8830 : : case PLUS_EXPR:
8831 : : case MINUS_EXPR:
8832 : : case NEGATE_EXPR:
8833 : : case BIT_AND_EXPR:
8834 : : case BIT_IOR_EXPR:
8835 : : case BIT_XOR_EXPR:
8836 : : case BIT_NOT_EXPR:
8837 : : return true;
8838 : :
8839 : 9891 : default:
8840 : 9891 : return false;
8841 : : }
8842 : : }
8843 : :
8844 : : /* Likewise, but taking a code_helper. */
8845 : :
8846 : : bool
8847 : 231 : vect_can_vectorize_without_simd_p (code_helper code)
8848 : : {
8849 : 231 : return (code.is_tree_code ()
8850 : 231 : && vect_can_vectorize_without_simd_p (tree_code (code)));
8851 : : }
8852 : :
8853 : : /* Create vector init for vectorized iv. */
8854 : : static tree
8855 : 833 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8856 : : tree step_expr, poly_uint64 nunits,
8857 : : tree vectype,
8858 : : enum vect_induction_op_type induction_type)
8859 : : {
8860 : 833 : unsigned HOST_WIDE_INT const_nunits;
8861 : 833 : tree vec_shift, vec_init, new_name;
8862 : 833 : unsigned i;
8863 : 833 : tree itype = TREE_TYPE (vectype);
8864 : :
8865 : : /* iv_loop is the loop to be vectorized. Create:
8866 : : vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
8867 : 833 : new_name = gimple_convert (stmts, itype, init_expr);
8868 : 833 : switch (induction_type)
8869 : : {
8870 : 18 : case vect_step_op_shr:
8871 : 18 : case vect_step_op_shl:
8872 : : /* Build the Initial value from shift_expr. */
8873 : 18 : vec_init = gimple_build_vector_from_val (stmts,
8874 : : vectype,
8875 : : new_name);
8876 : 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8877 : : build_zero_cst (itype), step_expr);
8878 : 18 : vec_init = gimple_build (stmts,
8879 : : (induction_type == vect_step_op_shr
8880 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
8881 : : vectype, vec_init, vec_shift);
8882 : 18 : break;
8883 : :
8884 : 739 : case vect_step_op_neg:
8885 : 739 : {
8886 : 739 : vec_init = gimple_build_vector_from_val (stmts,
8887 : : vectype,
8888 : : new_name);
8889 : 739 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8890 : : vectype, vec_init);
8891 : : /* The encoding has 2 interleaved stepped patterns. */
8892 : 739 : vec_perm_builder sel (nunits, 2, 3);
8893 : 739 : sel.quick_grow (6);
8894 : 3695 : for (i = 0; i < 3; i++)
8895 : : {
8896 : 2217 : sel[2 * i] = i;
8897 : 2217 : sel[2 * i + 1] = i + nunits;
8898 : : }
8899 : 739 : vec_perm_indices indices (sel, 2, nunits);
8900 : : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8901 : : fail when vec_init is const vector. In that situation vec_perm is not
8902 : : really needed. */
8903 : 739 : tree perm_mask_even
8904 : 739 : = vect_gen_perm_mask_any (vectype, indices);
8905 : 739 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8906 : : vectype,
8907 : : vec_init, vec_neg,
8908 : : perm_mask_even);
8909 : 739 : }
8910 : 739 : break;
8911 : :
8912 : 76 : case vect_step_op_mul:
8913 : 76 : {
8914 : : /* Use unsigned mult to avoid UD integer overflow. */
8915 : 76 : gcc_assert (nunits.is_constant (&const_nunits));
8916 : 76 : tree utype = unsigned_type_for (itype);
8917 : 76 : tree uvectype = build_vector_type (utype,
8918 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
8919 : 76 : new_name = gimple_convert (stmts, utype, new_name);
8920 : 76 : vec_init = gimple_build_vector_from_val (stmts,
8921 : : uvectype,
8922 : : new_name);
8923 : 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
8924 : 76 : tree elt_step = build_one_cst (utype);
8925 : :
8926 : 76 : elts.quick_push (elt_step);
8927 : 660 : for (i = 1; i < const_nunits; i++)
8928 : : {
8929 : : /* Create: new_name_i = new_name + step_expr. */
8930 : 508 : elt_step = gimple_build (stmts, MULT_EXPR,
8931 : : utype, elt_step, step_expr);
8932 : 508 : elts.quick_push (elt_step);
8933 : : }
8934 : : /* Create a vector from [new_name_0, new_name_1, ...,
8935 : : new_name_nunits-1]. */
8936 : 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
8937 : 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8938 : : vec_init, vec_mul);
8939 : 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
8940 : 76 : }
8941 : 76 : break;
8942 : :
8943 : 0 : default:
8944 : 0 : gcc_unreachable ();
8945 : : }
8946 : :
8947 : 833 : return vec_init;
8948 : : }
8949 : :
8950 : : /* Peel init_expr by skip_niter for induction_type. */
8951 : : tree
8952 : 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8953 : : tree skip_niters, tree step_expr,
8954 : : enum vect_induction_op_type induction_type)
8955 : : {
8956 : 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8957 : 84 : tree type = TREE_TYPE (init_expr);
8958 : 84 : unsigned prec = TYPE_PRECISION (type);
8959 : 84 : switch (induction_type)
8960 : : {
8961 : 0 : case vect_step_op_neg:
8962 : 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
8963 : 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8964 : : /* else no change. */
8965 : : break;
8966 : :
8967 : 12 : case vect_step_op_shr:
8968 : 12 : case vect_step_op_shl:
8969 : 12 : skip_niters = gimple_convert (stmts, type, skip_niters);
8970 : 12 : step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8971 : : /* When shift mount >= precision, need to avoid UD.
8972 : : In the original loop, there's no UD, and according to semantic,
8973 : : init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
8974 : 12 : if (!tree_fits_uhwi_p (step_expr)
8975 : 12 : || tree_to_uhwi (step_expr) >= prec)
8976 : : {
8977 : 6 : if (induction_type == vect_step_op_shl
8978 : 6 : || TYPE_UNSIGNED (type))
8979 : 4 : init_expr = build_zero_cst (type);
8980 : : else
8981 : 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8982 : : init_expr,
8983 : 4 : wide_int_to_tree (type, prec - 1));
8984 : : }
8985 : : else
8986 : 8 : init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8987 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
8988 : : type, init_expr, step_expr);
8989 : : break;
8990 : :
8991 : 72 : case vect_step_op_mul:
8992 : 72 : {
8993 : 72 : tree utype = unsigned_type_for (type);
8994 : 72 : init_expr = gimple_convert (stmts, utype, init_expr);
8995 : 72 : wide_int skipn = wi::to_wide (skip_niters);
8996 : 72 : wide_int begin = wi::to_wide (step_expr);
8997 : 72 : auto_mpz base, exp, mod, res;
8998 : 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
8999 : 72 : wi::to_mpz (skipn, exp, UNSIGNED);
9000 : 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9001 : 72 : mpz_powm (res, base, exp, mod);
9002 : 72 : begin = wi::from_mpz (utype, res, true);
9003 : 72 : tree mult_expr = wide_int_to_tree (utype, begin);
9004 : 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
9005 : : init_expr, mult_expr);
9006 : 72 : init_expr = gimple_convert (stmts, type, init_expr);
9007 : 72 : }
9008 : 72 : break;
9009 : :
9010 : 0 : default:
9011 : 0 : gcc_unreachable ();
9012 : : }
9013 : :
9014 : 84 : return init_expr;
9015 : : }
9016 : :
9017 : : /* Create vector step for vectorized iv. */
9018 : : static tree
9019 : 1068 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9020 : : poly_uint64 vf,
9021 : : enum vect_induction_op_type induction_type)
9022 : : {
9023 : 1068 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9024 : 1068 : tree new_name = NULL;
9025 : : /* Step should be pow (step, vf) for mult induction. */
9026 : 1068 : if (induction_type == vect_step_op_mul)
9027 : : {
9028 : 76 : gcc_assert (vf.is_constant ());
9029 : 76 : wide_int begin = wi::to_wide (step_expr);
9030 : :
9031 : 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9032 : 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9033 : :
9034 : 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9035 : 76 : }
9036 : 992 : else if (induction_type == vect_step_op_neg)
9037 : : /* Do nothing. */
9038 : : ;
9039 : : else
9040 : 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9041 : : expr, step_expr);
9042 : 1068 : return new_name;
9043 : : }
9044 : :
9045 : : static tree
9046 : 1068 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9047 : : stmt_vec_info stmt_info,
9048 : : tree new_name, tree vectype,
9049 : : enum vect_induction_op_type induction_type)
9050 : : {
9051 : : /* No step is needed for neg induction. */
9052 : 1068 : if (induction_type == vect_step_op_neg)
9053 : : return NULL;
9054 : :
9055 : 94 : tree t = unshare_expr (new_name);
9056 : 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9057 : : || TREE_CODE (new_name) == SSA_NAME);
9058 : 94 : tree new_vec = build_vector_from_val (vectype, t);
9059 : 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9060 : : new_vec, vectype, NULL);
9061 : 94 : return vec_step;
9062 : : }
9063 : :
9064 : : /* Update vectorized iv with vect_step, induc_def is init. */
9065 : : static tree
9066 : 1250 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9067 : : tree induc_def, tree vec_step,
9068 : : enum vect_induction_op_type induction_type)
9069 : : {
9070 : 1250 : tree vec_def = induc_def;
9071 : 1250 : switch (induction_type)
9072 : : {
9073 : 76 : case vect_step_op_mul:
9074 : 76 : {
9075 : : /* Use unsigned mult to avoid UD integer overflow. */
9076 : 76 : tree uvectype
9077 : 76 : = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9078 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
9079 : 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9080 : 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9081 : 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9082 : : vec_def, vec_step);
9083 : 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9084 : : }
9085 : 76 : break;
9086 : :
9087 : 12 : case vect_step_op_shr:
9088 : 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9089 : : vec_def, vec_step);
9090 : 12 : break;
9091 : :
9092 : 6 : case vect_step_op_shl:
9093 : 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9094 : : vec_def, vec_step);
9095 : 6 : break;
9096 : : case vect_step_op_neg:
9097 : : vec_def = induc_def;
9098 : : /* Do nothing. */
9099 : : break;
9100 : 0 : default:
9101 : 0 : gcc_unreachable ();
9102 : : }
9103 : :
9104 : 1250 : return vec_def;
9105 : :
9106 : : }
9107 : :
9108 : : /* Function vectorizable_nonlinear_induction
9109 : :
9110 : : Check if STMT_INFO performs an nonlinear induction computation that can be
9111 : : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9112 : : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9113 : : basic block.
9114 : : Return true if STMT_INFO is vectorizable in this way. */
9115 : :
9116 : : static bool
9117 : 9863 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9118 : : stmt_vec_info stmt_info,
9119 : : slp_tree slp_node,
9120 : : stmt_vector_for_cost *cost_vec)
9121 : : {
9122 : 9863 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9123 : 9863 : unsigned ncopies;
9124 : 9863 : bool nested_in_vect_loop = false;
9125 : 9863 : class loop *iv_loop;
9126 : 9863 : tree vec_def;
9127 : 9863 : edge pe = loop_preheader_edge (loop);
9128 : 9863 : basic_block new_bb;
9129 : 9863 : tree vec_init, vec_step;
9130 : 9863 : tree new_name;
9131 : 9863 : gimple *new_stmt;
9132 : 9863 : gphi *induction_phi;
9133 : 9863 : tree induc_def, vec_dest;
9134 : 9863 : tree init_expr, step_expr;
9135 : 9863 : tree niters_skip;
9136 : 9863 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9137 : 9863 : unsigned i;
9138 : 9863 : gimple_stmt_iterator si;
9139 : :
9140 : 9863 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9141 : :
9142 : 9863 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9143 : 9863 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9144 : 9863 : enum vect_induction_op_type induction_type
9145 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9146 : :
9147 : 9863 : gcc_assert (induction_type > vect_step_op_add);
9148 : :
9149 : 9863 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
9150 : 9863 : gcc_assert (ncopies >= 1);
9151 : :
9152 : : /* FORNOW. Only handle nonlinear induction in the same loop. */
9153 : 9863 : if (nested_in_vect_loop_p (loop, stmt_info))
9154 : : {
9155 : 0 : if (dump_enabled_p ())
9156 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9157 : : "nonlinear induction in nested loop.\n");
9158 : 0 : return false;
9159 : : }
9160 : :
9161 : 9863 : iv_loop = loop;
9162 : 9863 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9163 : :
9164 : : /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
9165 : : vector iv update for each iv and a permutation to generate wanted
9166 : : vector iv. */
9167 : 9863 : if (SLP_TREE_LANES (slp_node) > 1)
9168 : : {
9169 : 0 : if (dump_enabled_p ())
9170 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9171 : : "SLP induction not supported for nonlinear"
9172 : : " induction.\n");
9173 : 0 : return false;
9174 : : }
9175 : :
9176 : 9863 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9177 : : {
9178 : 0 : if (dump_enabled_p ())
9179 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9180 : : "floating point nonlinear induction vectorization"
9181 : : " not supported.\n");
9182 : 0 : return false;
9183 : : }
9184 : :
9185 : 9863 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9186 : 9863 : init_expr = vect_phi_initial_value (phi);
9187 : 9863 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9188 : : && TREE_CODE (step_expr) == INTEGER_CST);
9189 : : /* step_expr should be aligned with init_expr,
9190 : : .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9191 : 9863 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9192 : :
9193 : 9863 : if (TREE_CODE (init_expr) == INTEGER_CST)
9194 : 2490 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9195 : 7373 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9196 : : {
9197 : : /* INIT_EXPR could be a bit_field, bail out for such case. */
9198 : 4 : if (dump_enabled_p ())
9199 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9200 : : "nonlinear induction vectorization failed:"
9201 : : " component type of vectype is not a nop conversion"
9202 : : " from type of init_expr.\n");
9203 : 4 : return false;
9204 : : }
9205 : :
9206 : 9859 : switch (induction_type)
9207 : : {
9208 : 2205 : case vect_step_op_neg:
9209 : 2205 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9210 : : return false;
9211 : 2203 : if (TREE_CODE (init_expr) != INTEGER_CST
9212 : 190 : && TREE_CODE (init_expr) != REAL_CST)
9213 : : {
9214 : : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9215 : 190 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9216 : 0 : return false;
9217 : :
9218 : : /* The encoding has 2 interleaved stepped patterns. */
9219 : 190 : vec_perm_builder sel (nunits, 2, 3);
9220 : 190 : machine_mode mode = TYPE_MODE (vectype);
9221 : 190 : sel.quick_grow (6);
9222 : 950 : for (i = 0; i < 3; i++)
9223 : : {
9224 : 570 : sel[i * 2] = i;
9225 : 570 : sel[i * 2 + 1] = i + nunits;
9226 : : }
9227 : 190 : vec_perm_indices indices (sel, 2, nunits);
9228 : 190 : if (!can_vec_perm_const_p (mode, mode, indices))
9229 : 0 : return false;
9230 : 190 : }
9231 : : break;
9232 : :
9233 : 724 : case vect_step_op_mul:
9234 : 724 : {
9235 : : /* Check for backend support of MULT_EXPR. */
9236 : 724 : if (!directly_supported_p (MULT_EXPR, vectype))
9237 : : return false;
9238 : :
9239 : : /* ?? How to construct vector step for variable number vector.
9240 : : [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9241 : : if (!vf.is_constant ())
9242 : : return false;
9243 : : }
9244 : : break;
9245 : :
9246 : 6834 : case vect_step_op_shr:
9247 : : /* Check for backend support of RSHIFT_EXPR. */
9248 : 6834 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9249 : : return false;
9250 : :
9251 : : /* Don't shift more than type precision to avoid UD. */
9252 : 26 : if (!tree_fits_uhwi_p (step_expr)
9253 : 26 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9254 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9255 : : return false;
9256 : : break;
9257 : :
9258 : 96 : case vect_step_op_shl:
9259 : : /* Check for backend support of RSHIFT_EXPR. */
9260 : 96 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9261 : : return false;
9262 : :
9263 : : /* Don't shift more than type precision to avoid UD. */
9264 : 12 : if (!tree_fits_uhwi_p (step_expr)
9265 : 12 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9266 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9267 : : return false;
9268 : :
9269 : : break;
9270 : :
9271 : 0 : default:
9272 : 0 : gcc_unreachable ();
9273 : : }
9274 : :
9275 : 2811 : if (cost_vec) /* transformation not required. */
9276 : : {
9277 : 1978 : unsigned inside_cost = 0, prologue_cost = 0;
9278 : : /* loop cost for vec_loop. Neg induction doesn't have any
9279 : : inside_cost. */
9280 : 1978 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9281 : : slp_node, 0, vect_body);
9282 : :
9283 : : /* loop cost for vec_loop. Neg induction doesn't have any
9284 : : inside_cost. */
9285 : 1978 : if (induction_type == vect_step_op_neg)
9286 : 1464 : inside_cost = 0;
9287 : :
9288 : : /* prologue cost for vec_init and vec_step. */
9289 : 1978 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9290 : : slp_node, 0, vect_prologue);
9291 : :
9292 : 1978 : if (dump_enabled_p ())
9293 : 60 : dump_printf_loc (MSG_NOTE, vect_location,
9294 : : "vect_model_induction_cost: inside_cost = %d, "
9295 : : "prologue_cost = %d. \n", inside_cost,
9296 : : prologue_cost);
9297 : :
9298 : 1978 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9299 : 1978 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9300 : 1978 : return true;
9301 : : }
9302 : :
9303 : : /* Transform. */
9304 : :
9305 : : /* Compute a vector variable, initialized with the first VF values of
9306 : : the induction variable. E.g., for an iv with IV_PHI='X' and
9307 : : evolution S, for a vector of 4 units, we want to compute:
9308 : : [X, X + S, X + 2*S, X + 3*S]. */
9309 : :
9310 : 833 : if (dump_enabled_p ())
9311 : 32 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9312 : :
9313 : 833 : pe = loop_preheader_edge (iv_loop);
9314 : : /* Find the first insertion point in the BB. */
9315 : 833 : basic_block bb = gimple_bb (phi);
9316 : 833 : si = gsi_after_labels (bb);
9317 : :
9318 : 833 : gimple_seq stmts = NULL;
9319 : :
9320 : 833 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9321 : : /* If we are using the loop mask to "peel" for alignment then we need
9322 : : to adjust the start value here. */
9323 : 833 : if (niters_skip != NULL_TREE)
9324 : 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9325 : : step_expr, induction_type);
9326 : :
9327 : 833 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9328 : : step_expr, nunits, vectype,
9329 : : induction_type);
9330 : 833 : if (stmts)
9331 : : {
9332 : 162 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9333 : 162 : gcc_assert (!new_bb);
9334 : : }
9335 : :
9336 : 833 : stmts = NULL;
9337 : 833 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9338 : : vf, induction_type);
9339 : 833 : if (stmts)
9340 : : {
9341 : 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9342 : 0 : gcc_assert (!new_bb);
9343 : : }
9344 : :
9345 : 833 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9346 : : new_name, vectype,
9347 : : induction_type);
9348 : : /* Create the following def-use cycle:
9349 : : loop prolog:
9350 : : vec_init = ...
9351 : : vec_step = ...
9352 : : loop:
9353 : : vec_iv = PHI <vec_init, vec_loop>
9354 : : ...
9355 : : STMT
9356 : : ...
9357 : : vec_loop = vec_iv + vec_step; */
9358 : :
9359 : : /* Create the induction-phi that defines the induction-operand. */
9360 : 833 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9361 : 833 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9362 : 833 : induc_def = PHI_RESULT (induction_phi);
9363 : :
9364 : : /* Create the iv update inside the loop. */
9365 : 833 : stmts = NULL;
9366 : 833 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9367 : : induc_def, vec_step,
9368 : : induction_type);
9369 : :
9370 : 833 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9371 : 833 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9372 : :
9373 : : /* Set the arguments of the phi node: */
9374 : 833 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9375 : 833 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9376 : : UNKNOWN_LOCATION);
9377 : :
9378 : 833 : slp_node->push_vec_def (induction_phi);
9379 : :
9380 : : /* In case that vectorization factor (VF) is bigger than the number
9381 : : of elements that we can fit in a vectype (nunits), we have to generate
9382 : : more than one vector stmt - i.e - we need to "unroll" the
9383 : : vector stmt by a factor VF/nunits. For more details see documentation
9384 : : in vectorizable_operation. */
9385 : :
9386 : 833 : if (ncopies > 1)
9387 : : {
9388 : 235 : stmts = NULL;
9389 : : /* FORNOW. This restriction should be relaxed. */
9390 : 235 : gcc_assert (!nested_in_vect_loop);
9391 : :
9392 : 235 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9393 : : nunits, induction_type);
9394 : :
9395 : 235 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9396 : : new_name, vectype,
9397 : : induction_type);
9398 : 235 : vec_def = induc_def;
9399 : 887 : for (i = 1; i < ncopies; i++)
9400 : : {
9401 : : /* vec_i = vec_prev + vec_step. */
9402 : 417 : stmts = NULL;
9403 : 417 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9404 : : vec_def, vec_step,
9405 : : induction_type);
9406 : 417 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9407 : 417 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9408 : 417 : slp_node->push_vec_def (new_stmt);
9409 : : }
9410 : : }
9411 : :
9412 : 833 : if (dump_enabled_p ())
9413 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
9414 : : "transform induction: created def-use cycle: %G%G",
9415 : 32 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9416 : :
9417 : : return true;
9418 : : }
9419 : :
9420 : : /* Function vectorizable_induction
9421 : :
9422 : : Check if STMT_INFO performs an induction computation that can be vectorized.
9423 : : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9424 : : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9425 : : Return true if STMT_INFO is vectorizable in this way. */
9426 : :
9427 : : bool
9428 : 280067 : vectorizable_induction (loop_vec_info loop_vinfo,
9429 : : stmt_vec_info stmt_info,
9430 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9431 : : {
9432 : 280067 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9433 : 280067 : bool nested_in_vect_loop = false;
9434 : 280067 : class loop *iv_loop;
9435 : 280067 : tree vec_def;
9436 : 280067 : edge pe = loop_preheader_edge (loop);
9437 : 280067 : basic_block new_bb;
9438 : 280067 : tree vec_init = NULL_TREE, vec_step, t;
9439 : 280067 : tree new_name;
9440 : 280067 : gphi *induction_phi;
9441 : 280067 : tree induc_def, vec_dest;
9442 : 280067 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9443 : 280067 : unsigned i;
9444 : 280067 : tree expr;
9445 : 280067 : tree index_vectype = NULL_TREE;
9446 : 280067 : gimple_stmt_iterator si;
9447 : 280067 : enum vect_induction_op_type induction_type
9448 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9449 : :
9450 : 303746 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9451 : 146208 : if (!phi)
9452 : : return false;
9453 : :
9454 : 146208 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9455 : : return false;
9456 : :
9457 : : /* Make sure it was recognized as induction computation. */
9458 : 146208 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9459 : : return false;
9460 : :
9461 : : /* Handle nonlinear induction in a separate place. */
9462 : 142149 : if (induction_type != vect_step_op_add)
9463 : 9863 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9464 : 9863 : slp_node, cost_vec);
9465 : :
9466 : 132286 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9467 : 132286 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9468 : :
9469 : : /* FORNOW. These restrictions should be relaxed. */
9470 : 132286 : if (nested_in_vect_loop_p (loop, stmt_info))
9471 : : {
9472 : 606 : imm_use_iterator imm_iter;
9473 : 606 : use_operand_p use_p;
9474 : 606 : gimple *exit_phi;
9475 : 606 : edge latch_e;
9476 : 606 : tree loop_arg;
9477 : :
9478 : 606 : exit_phi = NULL;
9479 : 606 : latch_e = loop_latch_edge (loop->inner);
9480 : 606 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9481 : 1254 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9482 : : {
9483 : 658 : gimple *use_stmt = USE_STMT (use_p);
9484 : 658 : if (is_gimple_debug (use_stmt))
9485 : 36 : continue;
9486 : :
9487 : 622 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9488 : : {
9489 : : exit_phi = use_stmt;
9490 : : break;
9491 : : }
9492 : : }
9493 : 606 : if (exit_phi)
9494 : : {
9495 : 10 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9496 : 10 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9497 : 6 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9498 : : {
9499 : 4 : if (dump_enabled_p ())
9500 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9501 : : "inner-loop induction only used outside "
9502 : : "of the outer vectorized loop.\n");
9503 : 4 : return false;
9504 : : }
9505 : : }
9506 : :
9507 : 602 : nested_in_vect_loop = true;
9508 : 602 : iv_loop = loop->inner;
9509 : : }
9510 : : else
9511 : : iv_loop = loop;
9512 : 132282 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9513 : :
9514 : 132282 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
9515 : : {
9516 : : /* The current SLP code creates the step value element-by-element. */
9517 : : if (dump_enabled_p ())
9518 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9519 : : "SLP induction not supported for variable-length"
9520 : : " vectors.\n");
9521 : : return false;
9522 : : }
9523 : :
9524 : 132282 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9525 : : {
9526 : 12 : if (dump_enabled_p ())
9527 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9528 : : "floating point induction vectorization disabled\n");
9529 : 12 : return false;
9530 : : }
9531 : :
9532 : 132270 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9533 : 132270 : gcc_assert (step_expr != NULL_TREE);
9534 : 264494 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
9535 : 264402 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
9536 : : {
9537 : 12 : if (dump_enabled_p ())
9538 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9539 : : "bit-precision induction vectorization not "
9540 : : "supported.\n");
9541 : 12 : return false;
9542 : : }
9543 : 132258 : tree stept = TREE_TYPE (step_expr);
9544 : 132258 : tree step_vectype = get_same_sized_vectype (stept, vectype);
9545 : 132258 : stept = TREE_TYPE (step_vectype);
9546 : :
9547 : : /* Check for target support of the vectorized arithmetic used here. */
9548 : 132258 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
9549 : 132258 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
9550 : 19592 : return false;
9551 : 112666 : if (!nunits.is_constant ())
9552 : : {
9553 : : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
9554 : : return false;
9555 : : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
9556 : : if (SCALAR_FLOAT_TYPE_P (stept))
9557 : : {
9558 : : tree index_type = build_nonstandard_integer_type
9559 : : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
9560 : :
9561 : : index_vectype = build_vector_type (index_type, nunits);
9562 : : if (!can_float_p (TYPE_MODE (step_vectype),
9563 : : TYPE_MODE (index_vectype), 1))
9564 : : return false;
9565 : : }
9566 : : }
9567 : :
9568 : 112666 : unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
9569 : 112666 : if (cost_vec) /* transformation not required. */
9570 : : {
9571 : 292170 : unsigned inside_cost = 0, prologue_cost = 0;
9572 : : /* We eventually need to set a vector type on invariant
9573 : : arguments. */
9574 : : unsigned j;
9575 : : slp_tree child;
9576 : 292170 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9577 : 194780 : if (!vect_maybe_update_slp_op_vectype
9578 : 194780 : (child, SLP_TREE_VECTYPE (slp_node)))
9579 : : {
9580 : 0 : if (dump_enabled_p ())
9581 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9582 : : "incompatible vector types for "
9583 : : "invariants\n");
9584 : 0 : return false;
9585 : : }
9586 : : /* loop cost for vec_loop. */
9587 : 97390 : inside_cost = record_stmt_cost (cost_vec, nvects,
9588 : : vector_stmt, slp_node, 0, vect_body);
9589 : : /* prologue cost for vec_init (if not nested) and step. */
9590 : 97390 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9591 : : scalar_to_vec,
9592 : : slp_node, 0, vect_prologue);
9593 : 97390 : if (dump_enabled_p ())
9594 : 3923 : dump_printf_loc (MSG_NOTE, vect_location,
9595 : : "vect_model_induction_cost: inside_cost = %d, "
9596 : : "prologue_cost = %d .\n", inside_cost,
9597 : : prologue_cost);
9598 : :
9599 : 97390 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9600 : 97390 : DUMP_VECT_SCOPE ("vectorizable_induction");
9601 : 97390 : return true;
9602 : : }
9603 : :
9604 : : /* Transform. */
9605 : :
9606 : : /* Compute a vector variable, initialized with the first VF values of
9607 : : the induction variable. E.g., for an iv with IV_PHI='X' and
9608 : : evolution S, for a vector of 4 units, we want to compute:
9609 : : [X, X + S, X + 2*S, X + 3*S]. */
9610 : :
9611 : 15276 : if (dump_enabled_p ())
9612 : 2814 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9613 : :
9614 : 15276 : pe = loop_preheader_edge (iv_loop);
9615 : : /* Find the first insertion point in the BB. */
9616 : 15276 : basic_block bb = gimple_bb (phi);
9617 : 15276 : si = gsi_after_labels (bb);
9618 : :
9619 : : /* For SLP induction we have to generate several IVs as for example
9620 : : with group size 3 we need
9621 : : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9622 : : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9623 : 15276 : gimple_stmt_iterator incr_si;
9624 : 15276 : bool insert_after;
9625 : 15276 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
9626 : :
9627 : : /* The initial values are vectorized, but any lanes > group_size
9628 : : need adjustment. */
9629 : 15276 : slp_tree init_node
9630 : 15276 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9631 : :
9632 : : /* Gather steps. Since we do not vectorize inductions as
9633 : : cycles we have to reconstruct the step from SCEV data. */
9634 : 15276 : unsigned group_size = SLP_TREE_LANES (slp_node);
9635 : 15276 : tree *steps = XALLOCAVEC (tree, group_size);
9636 : 15276 : tree *inits = XALLOCAVEC (tree, group_size);
9637 : 15276 : stmt_vec_info phi_info;
9638 : 47012 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9639 : : {
9640 : 16460 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9641 : 16460 : if (!init_node)
9642 : 16292 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9643 : : pe->dest_idx);
9644 : : }
9645 : :
9646 : : /* Now generate the IVs. */
9647 : 30552 : gcc_assert (multiple_p (nunits * nvects, group_size));
9648 : 15276 : unsigned nivs;
9649 : 15276 : unsigned HOST_WIDE_INT const_nunits;
9650 : 15276 : if (nested_in_vect_loop)
9651 : : nivs = nvects;
9652 : 15122 : else if (nunits.is_constant (&const_nunits))
9653 : : {
9654 : : /* Compute the number of distinct IVs we need. First reduce
9655 : : group_size if it is a multiple of const_nunits so we get
9656 : : one IV for a group_size of 4 but const_nunits 2. */
9657 : 15122 : unsigned group_sizep = group_size;
9658 : 15122 : if (group_sizep % const_nunits == 0)
9659 : 109 : group_sizep = group_sizep / const_nunits;
9660 : 15122 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
9661 : : }
9662 : : else
9663 : : {
9664 : : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9665 : : nivs = 1;
9666 : : }
9667 : 15276 : gimple_seq init_stmts = NULL;
9668 : 15276 : tree lupdate_mul = NULL_TREE;
9669 : 154 : if (!nested_in_vect_loop)
9670 : : {
9671 : 15122 : if (nunits.is_constant (&const_nunits))
9672 : : {
9673 : : /* The number of iterations covered in one vector iteration. */
9674 : 15122 : unsigned lup_mul = (nvects * const_nunits) / group_size;
9675 : 15122 : lupdate_mul
9676 : 15122 : = build_vector_from_val (step_vectype,
9677 : 15122 : SCALAR_FLOAT_TYPE_P (stept)
9678 : 27 : ? build_real_from_wide (stept, lup_mul,
9679 : : UNSIGNED)
9680 : 30217 : : build_int_cstu (stept, lup_mul));
9681 : : }
9682 : : else
9683 : : {
9684 : : if (SCALAR_FLOAT_TYPE_P (stept))
9685 : : {
9686 : : tree tem = build_int_cst (integer_type_node, vf);
9687 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9688 : : }
9689 : : else
9690 : : lupdate_mul = build_int_cst (stept, vf);
9691 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9692 : : lupdate_mul);
9693 : : }
9694 : : }
9695 : 15276 : tree peel_mul = NULL_TREE;
9696 : 15276 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9697 : : {
9698 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9699 : 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9700 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9701 : : else
9702 : 0 : peel_mul = gimple_convert (&init_stmts, stept,
9703 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9704 : 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
9705 : : step_vectype, peel_mul);
9706 : :
9707 : : /* If early break then we have to create a new PHI which we can use as
9708 : : an offset to adjust the induction reduction in early exits.
9709 : :
9710 : : This is because when peeling for alignment using masking, the first
9711 : : few elements of the vector can be inactive. As such if we find the
9712 : : entry in the first iteration we have adjust the starting point of
9713 : : the scalar code.
9714 : :
9715 : : We do this by creating a new scalar PHI that keeps track of whether
9716 : : we are the first iteration of the loop (with the additional masking)
9717 : : or whether we have taken a loop iteration already.
9718 : :
9719 : : The generated sequence:
9720 : :
9721 : : pre-header:
9722 : : bb1:
9723 : : i_1 = <number of leading inactive elements>
9724 : :
9725 : : header:
9726 : : bb2:
9727 : : i_2 = PHI <i_1(bb1), 0(latch)>
9728 : : …
9729 : :
9730 : : early-exit:
9731 : : bb3:
9732 : : i_3 = iv_step * i_2 + PHI<vector-iv>
9733 : :
9734 : : The first part of the adjustment to create i_1 and i_2 are done here
9735 : : and the last part creating i_3 is done in
9736 : : vectorizable_live_operations when the induction extraction is
9737 : : materialized. */
9738 : 0 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
9739 : 0 : && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
9740 : : {
9741 : 0 : auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9742 : 0 : tree ty_skip_niters = TREE_TYPE (skip_niters);
9743 : 0 : tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
9744 : : vect_scalar_var,
9745 : : "pfa_iv_offset");
9746 : 0 : gphi *nphi = create_phi_node (break_lhs_phi, bb);
9747 : 0 : add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
9748 : 0 : add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
9749 : : loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
9750 : :
9751 : 0 : LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo) = PHI_RESULT (nphi);
9752 : : }
9753 : : }
9754 : 15276 : tree step_mul = NULL_TREE;
9755 : 15276 : unsigned ivn;
9756 : 15276 : auto_vec<tree> vec_steps;
9757 : 31052 : for (ivn = 0; ivn < nivs; ++ivn)
9758 : : {
9759 : 15776 : gimple_seq stmts = NULL;
9760 : 15776 : bool invariant = true;
9761 : 15776 : if (nunits.is_constant (&const_nunits))
9762 : : {
9763 : 15776 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9764 : 15776 : tree_vector_builder init_elts (vectype, const_nunits, 1);
9765 : 15776 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9766 : 98796 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9767 : : {
9768 : : /* The scalar steps of the IVs. */
9769 : 83020 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9770 : 83020 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9771 : 83020 : step_elts.quick_push (elt);
9772 : 83020 : if (!init_node)
9773 : : {
9774 : : /* The scalar inits of the IVs if not vectorized. */
9775 : 82126 : elt = inits[(ivn*const_nunits + eltn) % group_size];
9776 : 82126 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
9777 : 82126 : TREE_TYPE (elt)))
9778 : 266 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9779 : 266 : TREE_TYPE (vectype), elt);
9780 : 82126 : init_elts.quick_push (elt);
9781 : : }
9782 : : /* The number of steps to add to the initial values. */
9783 : 83020 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9784 : 166040 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9785 : 165942 : ? build_real_from_wide (stept, mul_elt,
9786 : : UNSIGNED)
9787 : 165942 : : build_int_cstu (stept, mul_elt));
9788 : : }
9789 : 15776 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
9790 : 15776 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9791 : 15776 : if (!init_node)
9792 : 15604 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
9793 : 15776 : }
9794 : : else
9795 : : {
9796 : : if (init_node)
9797 : : ;
9798 : : else if (INTEGRAL_TYPE_P (TREE_TYPE (steps[0])))
9799 : : {
9800 : : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9801 : : /* Build the initial value directly as a VEC_SERIES_EXPR. */
9802 : : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
9803 : : step_vectype, new_name, steps[0]);
9804 : : if (!useless_type_conversion_p (vectype, step_vectype))
9805 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9806 : : vectype, vec_init);
9807 : : }
9808 : : else
9809 : : {
9810 : : /* Build:
9811 : : [base, base, base, ...]
9812 : : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9813 : : gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps[0])));
9814 : : gcc_assert (flag_associative_math);
9815 : : gcc_assert (index_vectype != NULL_TREE);
9816 : :
9817 : : tree index = build_index_vector (index_vectype, 0, 1);
9818 : : new_name = gimple_convert (&init_stmts, TREE_TYPE (steps[0]),
9819 : : inits[0]);
9820 : : tree base_vec = gimple_build_vector_from_val (&init_stmts,
9821 : : step_vectype,
9822 : : new_name);
9823 : : tree step_vec = gimple_build_vector_from_val (&init_stmts,
9824 : : step_vectype,
9825 : : steps[0]);
9826 : : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
9827 : : step_vectype, index);
9828 : : vec_init = gimple_build (&init_stmts, MULT_EXPR,
9829 : : step_vectype, vec_init, step_vec);
9830 : : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
9831 : : step_vectype, vec_init, base_vec);
9832 : : if (!useless_type_conversion_p (vectype, step_vectype))
9833 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9834 : : vectype, vec_init);
9835 : : }
9836 : : /* iv_loop is nested in the loop to be vectorized. Generate:
9837 : : vec_step = [S, S, S, S] */
9838 : : t = unshare_expr (steps[0]);
9839 : : gcc_assert (CONSTANT_CLASS_P (t)
9840 : : || TREE_CODE (t) == SSA_NAME);
9841 : : vec_step = gimple_build_vector_from_val (&init_stmts,
9842 : : step_vectype, t);
9843 : : }
9844 : 15776 : vec_steps.safe_push (vec_step);
9845 : 15776 : if (peel_mul)
9846 : : {
9847 : 0 : if (!step_mul)
9848 : : {
9849 : 0 : gcc_assert (!nunits.is_constant ());
9850 : : step_mul = gimple_build (&init_stmts,
9851 : : MINUS_EXPR, step_vectype,
9852 : : build_zero_cst (step_vectype), peel_mul);
9853 : : }
9854 : : else
9855 : 0 : step_mul = gimple_build (&init_stmts,
9856 : : MINUS_EXPR, step_vectype,
9857 : : step_mul, peel_mul);
9858 : : }
9859 : :
9860 : : /* Create the induction-phi that defines the induction-operand. */
9861 : 15776 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9862 : : "vec_iv_");
9863 : 15776 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9864 : 15776 : induc_def = PHI_RESULT (induction_phi);
9865 : :
9866 : : /* Create the iv update inside the loop */
9867 : 15776 : tree up = vec_step;
9868 : 15776 : if (lupdate_mul)
9869 : : {
9870 : 15604 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
9871 : : {
9872 : : /* When we're using loop_len produced by SELEC_VL, the
9873 : : non-final iterations are not always processing VF
9874 : : elements. So vectorize induction variable instead of
9875 : :
9876 : : _21 = vect_vec_iv_.6_22 + { VF, ... };
9877 : :
9878 : : We should generate:
9879 : :
9880 : : _35 = .SELECT_VL (ivtmp_33, VF);
9881 : : vect_cst__22 = [vec_duplicate_expr] _35;
9882 : : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
9883 : 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
9884 : 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
9885 : : vectype, 0, 0);
9886 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9887 : 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
9888 : : else
9889 : 0 : expr = gimple_convert (&stmts, stept, len);
9890 : 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
9891 : : expr);
9892 : 0 : up = gimple_build (&stmts, MULT_EXPR,
9893 : : step_vectype, vec_step, lupdate_mul);
9894 : : }
9895 : : else
9896 : 15604 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9897 : : vec_step, lupdate_mul);
9898 : : }
9899 : 15776 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9900 : 15776 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
9901 : 15776 : vec_def = gimple_convert (&stmts, vectype, vec_def);
9902 : 15776 : insert_iv_increment (&incr_si, insert_after, stmts);
9903 : 15776 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9904 : : UNKNOWN_LOCATION);
9905 : :
9906 : 15776 : if (init_node)
9907 : 172 : vec_init = vect_get_slp_vect_def (init_node, ivn);
9908 : 15776 : if (!nested_in_vect_loop
9909 : 15776 : && step_mul
9910 : 15776 : && !integer_zerop (step_mul))
9911 : : {
9912 : 15194 : gcc_assert (invariant);
9913 : 15194 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9914 : 15194 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9915 : : vec_step, step_mul);
9916 : 15194 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9917 : : vec_def, up);
9918 : 15194 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9919 : : }
9920 : :
9921 : : /* Set the arguments of the phi node: */
9922 : 15776 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9923 : :
9924 : 15776 : slp_node->push_vec_def (induction_phi);
9925 : : }
9926 : 15276 : if (!nested_in_vect_loop)
9927 : : {
9928 : : /* Fill up to the number of vectors we need for the whole group. */
9929 : 15122 : if (nunits.is_constant (&const_nunits))
9930 : 15122 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
9931 : : else
9932 : : nivs = 1;
9933 : 15122 : vec_steps.reserve (nivs-ivn);
9934 : 30271 : for (; ivn < nivs; ++ivn)
9935 : : {
9936 : 27 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9937 : 27 : vec_steps.quick_push (vec_steps[0]);
9938 : : }
9939 : : }
9940 : :
9941 : : /* Re-use IVs when we can. We are generating further vector
9942 : : stmts by adding VF' * stride to the IVs generated above. */
9943 : 15276 : if (ivn < nvects)
9944 : : {
9945 : 3496 : if (nunits.is_constant (&const_nunits))
9946 : : {
9947 : 3496 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
9948 : 3496 : / group_size);
9949 : 3496 : lupdate_mul
9950 : 3496 : = build_vector_from_val (step_vectype,
9951 : 3496 : SCALAR_FLOAT_TYPE_P (stept)
9952 : 8 : ? build_real_from_wide (stept,
9953 : 8 : vfp, UNSIGNED)
9954 : 6984 : : build_int_cstu (stept, vfp));
9955 : : }
9956 : : else
9957 : : {
9958 : : if (SCALAR_FLOAT_TYPE_P (stept))
9959 : : {
9960 : : tree tem = build_int_cst (integer_type_node, nunits);
9961 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9962 : : }
9963 : : else
9964 : : lupdate_mul = build_int_cst (stept, nunits);
9965 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9966 : : lupdate_mul);
9967 : : }
9968 : 11132 : for (; ivn < nvects; ++ivn)
9969 : : {
9970 : 7636 : gimple *iv
9971 : 7636 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9972 : 7636 : tree def = gimple_get_lhs (iv);
9973 : 7636 : if (ivn < 2*nivs)
9974 : 3542 : vec_steps[ivn - nivs]
9975 : 3542 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9976 : 3542 : vec_steps[ivn - nivs], lupdate_mul);
9977 : 7636 : gimple_seq stmts = NULL;
9978 : 7636 : def = gimple_convert (&stmts, step_vectype, def);
9979 : 22908 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9980 : 7636 : def, vec_steps[ivn % nivs]);
9981 : 7636 : def = gimple_convert (&stmts, vectype, def);
9982 : 7636 : if (gimple_code (iv) == GIMPLE_PHI)
9983 : 3542 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9984 : : else
9985 : : {
9986 : 4094 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9987 : 4094 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9988 : : }
9989 : 7636 : slp_node->push_vec_def (def);
9990 : : }
9991 : : }
9992 : :
9993 : 15276 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9994 : 15276 : gcc_assert (!new_bb);
9995 : :
9996 : 15276 : return true;
9997 : 15276 : }
9998 : :
9999 : : /* Function vectorizable_live_operation_1.
10000 : :
10001 : : helper function for vectorizable_live_operation. */
10002 : :
10003 : : static tree
10004 : 5235 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
10005 : : tree vectype, slp_tree slp_node,
10006 : : tree bitsize, tree bitstart, tree vec_lhs,
10007 : : tree lhs_type, gimple_stmt_iterator *exit_gsi)
10008 : : {
10009 : 5235 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10010 : :
10011 : 5235 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10012 : 5235 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10013 : 10863 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10014 : 5628 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
10015 : :
10016 : 5235 : gimple_seq stmts = NULL;
10017 : 5235 : tree new_tree;
10018 : :
10019 : : /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10020 : 5235 : if (integer_zerop (bitstart))
10021 : : {
10022 : 2721 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10023 : : vec_lhs_phi, bitsize, bitstart);
10024 : :
10025 : : /* Convert the extracted vector element to the scalar type. */
10026 : 2721 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10027 : : }
10028 : 2514 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10029 : : {
10030 : : /* Emit:
10031 : :
10032 : : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - (BIAS + 1)>
10033 : :
10034 : : where VEC_LHS is the vectorized live-out result, LEN is the length of
10035 : : the vector, BIAS is the load-store bias. The bias should not be used
10036 : : at all since we are not using load/store operations, but LEN will be
10037 : : REALLEN + BIAS, so subtract it to get to the correct position. */
10038 : 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10039 : 0 : gimple_seq tem = NULL;
10040 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10041 : 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
10042 : : &LOOP_VINFO_LENS (loop_vinfo),
10043 : : 1, vectype, 0, 1);
10044 : 0 : gimple_seq_add_seq (&stmts, tem);
10045 : :
10046 : : /* BIAS + 1. */
10047 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10048 : 0 : tree bias_plus_one
10049 : 0 : = int_const_binop (PLUS_EXPR,
10050 : 0 : build_int_cst (TREE_TYPE (len), biasval),
10051 : 0 : build_one_cst (TREE_TYPE (len)));
10052 : :
10053 : : /* LAST_INDEX = LEN - (BIAS + 1). */
10054 : 0 : tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
10055 : : len, bias_plus_one);
10056 : :
10057 : : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - (BIAS + 1)>. */
10058 : 0 : tree scalar_res
10059 : 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10060 : : vec_lhs_phi, last_index);
10061 : :
10062 : : /* Convert the extracted vector element to the scalar type. */
10063 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10064 : : }
10065 : 2514 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10066 : : {
10067 : : /* Emit:
10068 : :
10069 : : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10070 : :
10071 : : where VEC_LHS is the vectorized live-out result and MASK is
10072 : : the loop mask for the final iteration. */
10073 : 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10074 : 0 : tree scalar_type = TREE_TYPE (vectype);
10075 : 0 : gimple_seq tem = NULL;
10076 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10077 : 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10078 : : &LOOP_VINFO_MASKS (loop_vinfo),
10079 : : 1, vectype, 0);
10080 : 0 : tree scalar_res;
10081 : 0 : gimple_seq_add_seq (&stmts, tem);
10082 : :
10083 : 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10084 : : mask, vec_lhs_phi);
10085 : :
10086 : : /* Convert the extracted vector element to the scalar type. */
10087 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10088 : : }
10089 : : else
10090 : : {
10091 : 2514 : tree bftype = TREE_TYPE (vectype);
10092 : 2514 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10093 : 87 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10094 : 2514 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10095 : 2514 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10096 : : &stmts, true, NULL_TREE);
10097 : : }
10098 : :
10099 : 5235 : *exit_gsi = gsi_after_labels (exit_bb);
10100 : 5235 : if (stmts)
10101 : 5235 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10102 : :
10103 : 5235 : return new_tree;
10104 : : }
10105 : :
10106 : : /* Function vectorizable_live_operation.
10107 : :
10108 : : STMT_INFO computes a value that is used outside the loop. Check if
10109 : : it can be supported. */
10110 : :
10111 : : bool
10112 : 274274 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10113 : : slp_tree slp_node, slp_instance slp_node_instance,
10114 : : int slp_index, bool vec_stmt_p,
10115 : : stmt_vector_for_cost *cost_vec)
10116 : : {
10117 : 274274 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10118 : 274274 : imm_use_iterator imm_iter;
10119 : 274274 : tree lhs, lhs_type, bitsize;
10120 : 274274 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10121 : 274274 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10122 : 274274 : gimple *use_stmt;
10123 : 274274 : use_operand_p use_p;
10124 : 274274 : auto_vec<tree> vec_oprnds;
10125 : 274274 : int vec_entry = 0;
10126 : 274274 : poly_uint64 vec_index = 0;
10127 : :
10128 : 274274 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10129 : : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10130 : :
10131 : : /* If a stmt of a reduction is live, vectorize it via
10132 : : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10133 : : validity so just trigger the transform here. */
10134 : 274274 : if (vect_is_reduction (slp_node))
10135 : : {
10136 : 52249 : if (!vec_stmt_p)
10137 : : return true;
10138 : : /* For SLP reductions we vectorize the epilogue for all involved stmts
10139 : : together. For SLP reduction chains we only get here once. */
10140 : 22317 : if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
10141 : 19967 : && slp_index != 0)
10142 : : return true;
10143 : 22006 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
10144 : 22006 : if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10145 : 22006 : || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10146 : : return true;
10147 : :
10148 : 21168 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10149 : 21168 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10150 : 21164 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10151 : : slp_node_instance,
10152 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10153 : :
10154 : : /* If early break we only have to materialize the reduction on the merge
10155 : : block, but we have to find an alternate exit first. */
10156 : 21168 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10157 : : {
10158 : 23 : slp_tree phis_node = slp_node_instance->reduc_phis;
10159 : 23 : stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
10160 : 69 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10161 : 23 : if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10162 : : {
10163 : 23 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10164 : : phis_node, slp_node_instance,
10165 : : exit);
10166 : 23 : break;
10167 : 23 : }
10168 : 23 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10169 : 4 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10170 : : phis_node, slp_node_instance,
10171 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10172 : : }
10173 : :
10174 : 21168 : return true;
10175 : : }
10176 : :
10177 : : /* If STMT is not relevant and it is a simple assignment and its inputs are
10178 : : invariant then it can remain in place, unvectorized. The original last
10179 : : scalar value that it computes will be used. */
10180 : 222025 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10181 : : {
10182 : 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10183 : 0 : if (dump_enabled_p ())
10184 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
10185 : : "statement is simple and uses invariant. Leaving in "
10186 : : "place.\n");
10187 : 0 : return true;
10188 : : }
10189 : :
10190 : 222025 : gcc_assert (slp_index >= 0);
10191 : :
10192 : : /* Get the last occurrence of the scalar index from the concatenation of
10193 : : all the slp vectors. Calculate which slp vector it is and the index
10194 : : within. */
10195 : 222025 : int num_scalar = SLP_TREE_LANES (slp_node);
10196 : 222025 : int num_vec = vect_get_num_copies (vinfo, slp_node);
10197 : 222025 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10198 : :
10199 : : /* Calculate which vector contains the result, and which lane of
10200 : : that vector we need. */
10201 : 222025 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10202 : : {
10203 : : if (dump_enabled_p ())
10204 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10205 : : "Cannot determine which vector holds the"
10206 : : " final result.\n");
10207 : : return false;
10208 : : }
10209 : :
10210 : 222025 : if (!vec_stmt_p)
10211 : : {
10212 : : /* No transformation required. */
10213 : 180118 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10214 : : {
10215 : 1 : if (SLP_TREE_LANES (slp_node) != 1)
10216 : : {
10217 : 0 : if (dump_enabled_p ())
10218 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10219 : : "can't operate on partial vectors "
10220 : : "because an SLP statement is live after "
10221 : : "the loop.\n");
10222 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10223 : : }
10224 : 1 : else if (num_vec > 1)
10225 : : {
10226 : 0 : if (dump_enabled_p ())
10227 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10228 : : "can't operate on partial vectors "
10229 : : "because ncopies is greater than 1.\n");
10230 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10231 : : }
10232 : : else
10233 : : {
10234 : 1 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10235 : : OPTIMIZE_FOR_SPEED))
10236 : 0 : vect_record_loop_mask (loop_vinfo,
10237 : : &LOOP_VINFO_MASKS (loop_vinfo),
10238 : : 1, vectype, NULL);
10239 : 1 : else if (can_vec_extract_var_idx_p (
10240 : 1 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10241 : 0 : vect_record_loop_len (loop_vinfo,
10242 : : &LOOP_VINFO_LENS (loop_vinfo),
10243 : : 1, vectype, 1);
10244 : : else
10245 : : {
10246 : 1 : if (dump_enabled_p ())
10247 : 0 : dump_printf_loc (
10248 : 0 : MSG_MISSED_OPTIMIZATION, vect_location,
10249 : : "can't operate on partial vectors "
10250 : : "because the target doesn't support extract "
10251 : : "last reduction.\n");
10252 : 1 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10253 : : }
10254 : : }
10255 : : }
10256 : : /* ??? Enable for loop costing as well. */
10257 : 1 : if (!loop_vinfo)
10258 : 90714 : record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
10259 : : 0, vect_epilogue);
10260 : 180118 : return true;
10261 : : }
10262 : :
10263 : : /* Use the lhs of the original scalar statement. */
10264 : 41907 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10265 : 41907 : if (dump_enabled_p ())
10266 : 1475 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10267 : : "stmt %G", stmt);
10268 : :
10269 : 41907 : lhs = gimple_get_lhs (stmt);
10270 : 41907 : lhs_type = TREE_TYPE (lhs);
10271 : :
10272 : 41907 : bitsize = vector_element_bits_tree (vectype);
10273 : :
10274 : : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10275 : 41907 : gcc_assert (!loop_vinfo
10276 : : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10277 : : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10278 : : || SLP_TREE_LANES (slp_node) == 1));
10279 : :
10280 : : /* Get the correct slp vectorized stmt. */
10281 : 41907 : tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10282 : 41907 : gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10283 : :
10284 : : /* In case we need to early break vectorize also get the first stmt. */
10285 : 41907 : tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10286 : :
10287 : : /* Get entry to use. */
10288 : 41907 : tree bitstart = bitsize_int (vec_index);
10289 : 41907 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10290 : :
10291 : 41907 : if (loop_vinfo)
10292 : : {
10293 : : /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10294 : : requirement, insert one phi node for it. It looks like:
10295 : : loop;
10296 : : BB:
10297 : : # lhs' = PHI <lhs>
10298 : : ==>
10299 : : loop;
10300 : : BB:
10301 : : # vec_lhs' = PHI <vec_lhs>
10302 : : new_tree = lane_extract <vec_lhs', ...>;
10303 : : lhs' = new_tree; */
10304 : :
10305 : 5290 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10306 : : /* Check if we have a loop where the chosen exit is not the main exit,
10307 : : in these cases for an early break we restart the iteration the vector code
10308 : : did. For the live values we want the value at the start of the iteration
10309 : : rather than at the end. */
10310 : 5290 : edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10311 : 5290 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10312 : 22456 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10313 : 17166 : if (!is_gimple_debug (use_stmt)
10314 : 17166 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10315 : 5235 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10316 : : {
10317 : 5235 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10318 : 5235 : phi_arg_index_from_use (use_p));
10319 : 5235 : gcc_assert (loop_exit_edge_p (loop, e));
10320 : 5235 : bool main_exit_edge = e == main_e;
10321 : 5235 : tree tmp_vec_lhs = vec_lhs;
10322 : 5235 : tree tmp_bitstart = bitstart;
10323 : :
10324 : : /* For early exit where the exit is not in the BB that leads
10325 : : to the latch then we're restarting the iteration in the
10326 : : scalar loop. So get the first live value. */
10327 : 13165 : bool early_break_first_element_p
10328 : 5235 : = (all_exits_as_early_p || !main_exit_edge)
10329 : 5235 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def;
10330 : 2695 : if (early_break_first_element_p)
10331 : : {
10332 : 2695 : tmp_vec_lhs = vec_lhs0;
10333 : 2695 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10334 : : }
10335 : :
10336 : 5235 : gimple_stmt_iterator exit_gsi;
10337 : 5235 : tree new_tree
10338 : 5235 : = vectorizable_live_operation_1 (loop_vinfo,
10339 : : e->dest, vectype,
10340 : : slp_node, bitsize,
10341 : : tmp_bitstart, tmp_vec_lhs,
10342 : : lhs_type, &exit_gsi);
10343 : :
10344 : 5235 : auto gsi = gsi_for_stmt (use_stmt);
10345 : 5235 : if (early_break_first_element_p
10346 : 2695 : && LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
10347 : : {
10348 : 0 : tree step_expr
10349 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10350 : 0 : tree break_lhs_phi
10351 : : = LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo);
10352 : 0 : tree ty_skip_niters = TREE_TYPE (break_lhs_phi);
10353 : 0 : gimple_seq iv_stmts = NULL;
10354 : :
10355 : : /* Now create the PHI for the outside loop usage to
10356 : : retrieve the value for the offset counter. */
10357 : 0 : tree rphi_step
10358 : 0 : = gimple_convert (&iv_stmts, ty_skip_niters, step_expr);
10359 : 0 : tree tmp2
10360 : 0 : = gimple_build (&iv_stmts, MULT_EXPR,
10361 : : ty_skip_niters, rphi_step,
10362 : : break_lhs_phi);
10363 : :
10364 : 0 : if (POINTER_TYPE_P (TREE_TYPE (new_tree)))
10365 : : {
10366 : 0 : tmp2 = gimple_convert (&iv_stmts, sizetype, tmp2);
10367 : 0 : tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
10368 : 0 : TREE_TYPE (new_tree), new_tree,
10369 : : tmp2);
10370 : : }
10371 : : else
10372 : : {
10373 : 0 : tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree),
10374 : : tmp2);
10375 : 0 : tmp2 = gimple_build (&iv_stmts, PLUS_EXPR,
10376 : 0 : TREE_TYPE (new_tree), new_tree,
10377 : : tmp2);
10378 : : }
10379 : :
10380 : 0 : new_tree = tmp2;
10381 : 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
10382 : : }
10383 : :
10384 : 5235 : tree lhs_phi = gimple_phi_result (use_stmt);
10385 : 5235 : remove_phi_node (&gsi, false);
10386 : 5235 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10387 : 5235 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10388 : 5235 : break;
10389 : 5290 : }
10390 : :
10391 : : /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10392 : 17221 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10393 : 11931 : gcc_assert (is_gimple_debug (use_stmt)
10394 : 5290 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10395 : : }
10396 : : else
10397 : : {
10398 : : /* For basic-block vectorization simply insert the lane-extraction. */
10399 : 36617 : tree bftype = TREE_TYPE (vectype);
10400 : 36617 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10401 : 0 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10402 : 36617 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10403 : : vec_lhs, bitsize, bitstart);
10404 : 36617 : gimple_seq stmts = NULL;
10405 : 36617 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10406 : : &stmts, true, NULL_TREE);
10407 : 36617 : if (TREE_CODE (new_tree) == SSA_NAME
10408 : 73234 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10409 : 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10410 : 36617 : if (is_a <gphi *> (vec_stmt))
10411 : : {
10412 : 2941 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10413 : 2941 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10414 : : }
10415 : : else
10416 : : {
10417 : 33676 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10418 : 33676 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10419 : : }
10420 : :
10421 : : /* Replace use of lhs with newly computed result. If the use stmt is a
10422 : : single arg PHI, just replace all uses of PHI result. It's necessary
10423 : : because lcssa PHI defining lhs may be before newly inserted stmt. */
10424 : 36617 : use_operand_p use_p;
10425 : 36617 : stmt_vec_info use_stmt_info;
10426 : 199346 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10427 : 162729 : if (!is_gimple_debug (use_stmt)
10428 : 162729 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10429 : 110047 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10430 : : {
10431 : : /* ??? This can happen when the live lane ends up being
10432 : : rooted in a vector construction code-generated by an
10433 : : external SLP node (and code-generation for that already
10434 : : happened). See gcc.dg/vect/bb-slp-47.c.
10435 : : Doing this is what would happen if that vector CTOR
10436 : : were not code-generated yet so it is not too bad.
10437 : : ??? In fact we'd likely want to avoid this situation
10438 : : in the first place. */
10439 : 63733 : if (TREE_CODE (new_tree) == SSA_NAME
10440 : 63469 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10441 : 63469 : && gimple_code (use_stmt) != GIMPLE_PHI
10442 : 119221 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10443 : : use_stmt))
10444 : : {
10445 : 264 : if (dump_enabled_p ())
10446 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10447 : : "Using original scalar computation for "
10448 : : "live lane because use preceeds vector "
10449 : : "def\n");
10450 : 264 : continue;
10451 : : }
10452 : : /* ??? It can also happen that we end up pulling a def into
10453 : : a loop where replacing out-of-loop uses would require
10454 : : a new LC SSA PHI node. Retain the original scalar in
10455 : : those cases as well. PR98064. */
10456 : 64709 : if (TREE_CODE (new_tree) == SSA_NAME
10457 : 63205 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10458 : 63205 : && (gimple_bb (use_stmt)->loop_father
10459 : 63205 : != gimple_bb (vec_stmt)->loop_father)
10460 : 70167 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10461 : 6962 : gimple_bb (use_stmt)->loop_father))
10462 : : {
10463 : 1504 : if (dump_enabled_p ())
10464 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10465 : : "Using original scalar computation for "
10466 : : "live lane because there is an out-of-loop "
10467 : : "definition for it\n");
10468 : 1504 : continue;
10469 : : }
10470 : 186985 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10471 : 62642 : SET_USE (use_p, new_tree);
10472 : 61701 : update_stmt (use_stmt);
10473 : 36617 : }
10474 : : }
10475 : :
10476 : : return true;
10477 : 274274 : }
10478 : :
10479 : : /* Given loop represented by LOOP_VINFO, return true if computation of
10480 : : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10481 : : otherwise. */
10482 : :
10483 : : static bool
10484 : 57678 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
10485 : : {
10486 : : /* Constant case. */
10487 : 57678 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10488 : : {
10489 : 33292 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10490 : 33292 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10491 : :
10492 : 33292 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10493 : 33292 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10494 : 33292 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10495 : : return true;
10496 : : }
10497 : :
10498 : 24386 : widest_int max;
10499 : 24386 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10500 : : /* Check the upper bound of loop niters. */
10501 : 24386 : if (get_max_loop_iterations (loop, &max))
10502 : : {
10503 : 24386 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10504 : 24386 : signop sgn = TYPE_SIGN (type);
10505 : 24386 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10506 : 24386 : if (max < type_max)
10507 : 24175 : return true;
10508 : 24386 : }
10509 : : return false;
10510 : 24386 : }
10511 : :
10512 : : /* Return a mask type with half the number of elements as OLD_TYPE,
10513 : : given that it should have mode NEW_MODE. */
10514 : :
10515 : : tree
10516 : 3102 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10517 : : {
10518 : 3102 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10519 : 3102 : return build_truth_vector_type_for_mode (nunits, new_mode);
10520 : : }
10521 : :
10522 : : /* Return a mask type with twice as many elements as OLD_TYPE,
10523 : : given that it should have mode NEW_MODE. */
10524 : :
10525 : : tree
10526 : 2077 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10527 : : {
10528 : 2077 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10529 : 2077 : return build_truth_vector_type_for_mode (nunits, new_mode);
10530 : : }
10531 : :
10532 : : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10533 : : contain a sequence of NVECTORS masks that each control a vector of type
10534 : : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10535 : : these vector masks with the vector version of SCALAR_MASK. */
10536 : :
10537 : : void
10538 : 128 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10539 : : unsigned int nvectors, tree vectype, tree scalar_mask)
10540 : : {
10541 : 128 : gcc_assert (nvectors != 0);
10542 : :
10543 : 128 : if (scalar_mask)
10544 : : {
10545 : 24 : scalar_cond_masked_key cond (scalar_mask, nvectors);
10546 : 24 : loop_vinfo->scalar_cond_masked_set.add (cond);
10547 : : }
10548 : :
10549 : 128 : masks->mask_set.add (std::make_pair (vectype, nvectors));
10550 : 128 : }
10551 : :
10552 : : /* Given a complete set of masks MASKS, extract mask number INDEX
10553 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10554 : : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10555 : :
10556 : : See the comment above vec_loop_masks for more details about the mask
10557 : : arrangement. */
10558 : :
10559 : : tree
10560 : 166 : vect_get_loop_mask (loop_vec_info loop_vinfo,
10561 : : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10562 : : unsigned int nvectors, tree vectype, unsigned int index)
10563 : : {
10564 : 166 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10565 : : == vect_partial_vectors_while_ult)
10566 : : {
10567 : 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10568 : 0 : tree mask_type = rgm->type;
10569 : :
10570 : : /* Populate the rgroup's mask array, if this is the first time we've
10571 : : used it. */
10572 : 0 : if (rgm->controls.is_empty ())
10573 : : {
10574 : 0 : rgm->controls.safe_grow_cleared (nvectors, true);
10575 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
10576 : : {
10577 : 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10578 : : /* Provide a dummy definition until the real one is available. */
10579 : 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10580 : 0 : rgm->controls[i] = mask;
10581 : : }
10582 : : }
10583 : :
10584 : 0 : tree mask = rgm->controls[index];
10585 : 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10586 : 0 : TYPE_VECTOR_SUBPARTS (vectype)))
10587 : : {
10588 : : /* A loop mask for data type X can be reused for data type Y
10589 : : if X has N times more elements than Y and if Y's elements
10590 : : are N times bigger than X's. In this case each sequence
10591 : : of N elements in the loop mask will be all-zero or all-one.
10592 : : We can then view-convert the mask so that each sequence of
10593 : : N elements is replaced by a single element. */
10594 : 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10595 : : TYPE_VECTOR_SUBPARTS (vectype)));
10596 : 0 : gimple_seq seq = NULL;
10597 : 0 : mask_type = truth_type_for (vectype);
10598 : 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10599 : 0 : if (seq)
10600 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10601 : : }
10602 : 0 : return mask;
10603 : : }
10604 : 166 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10605 : : == vect_partial_vectors_avx512)
10606 : : {
10607 : : /* The number of scalars per iteration and the number of vectors are
10608 : : both compile-time constants. */
10609 : 166 : unsigned int nscalars_per_iter
10610 : 166 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10611 : 166 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10612 : :
10613 : 166 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10614 : :
10615 : : /* The stored nV is dependent on the mask type produced. */
10616 : 166 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10617 : : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10618 : : == rgm->factor);
10619 : 166 : nvectors = rgm->factor;
10620 : :
10621 : : /* Populate the rgroup's mask array, if this is the first time we've
10622 : : used it. */
10623 : 166 : if (rgm->controls.is_empty ())
10624 : : {
10625 : 16 : rgm->controls.safe_grow_cleared (nvectors, true);
10626 : 98 : for (unsigned int i = 0; i < nvectors; ++i)
10627 : : {
10628 : 82 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10629 : : /* Provide a dummy definition until the real one is available. */
10630 : 82 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10631 : 82 : rgm->controls[i] = mask;
10632 : : }
10633 : : }
10634 : 166 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10635 : : TYPE_VECTOR_SUBPARTS (vectype)))
10636 : 150 : return rgm->controls[index];
10637 : :
10638 : : /* Split the vector if needed. Since we are dealing with integer mode
10639 : : masks with AVX512 we can operate on the integer representation
10640 : : performing the whole vector shifting. */
10641 : 16 : unsigned HOST_WIDE_INT factor;
10642 : 16 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10643 : 16 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
10644 : 0 : gcc_assert (ok);
10645 : 16 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10646 : 16 : tree mask_type = truth_type_for (vectype);
10647 : 16 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10648 : 16 : unsigned vi = index / factor;
10649 : 16 : unsigned vpart = index % factor;
10650 : 16 : tree vec = rgm->controls[vi];
10651 : 16 : gimple_seq seq = NULL;
10652 : 16 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10653 : 16 : lang_hooks.types.type_for_mode
10654 : 16 : (TYPE_MODE (rgm->type), 1), vec);
10655 : : /* For integer mode masks simply shift the right bits into position. */
10656 : 16 : if (vpart != 0)
10657 : 12 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10658 : : build_int_cst (integer_type_node,
10659 : 24 : (TYPE_VECTOR_SUBPARTS (vectype)
10660 : 12 : * vpart)));
10661 : 16 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10662 : 16 : (TYPE_MODE (mask_type), 1), vec);
10663 : 16 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10664 : 16 : if (seq)
10665 : 16 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10666 : 16 : return vec;
10667 : : }
10668 : : else
10669 : 0 : gcc_unreachable ();
10670 : : }
10671 : :
10672 : : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10673 : : lengths for controlling an operation on VECTYPE. The operation splits
10674 : : each element of VECTYPE into FACTOR separate subelements, measuring the
10675 : : length as a number of these subelements. */
10676 : :
10677 : : void
10678 : 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10679 : : unsigned int nvectors, tree vectype, unsigned int factor)
10680 : : {
10681 : 0 : gcc_assert (nvectors != 0);
10682 : 0 : if (lens->length () < nvectors)
10683 : 0 : lens->safe_grow_cleared (nvectors, true);
10684 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10685 : :
10686 : : /* The number of scalars per iteration, scalar occupied bytes and
10687 : : the number of vectors are both compile-time constants. */
10688 : 0 : unsigned int nscalars_per_iter
10689 : 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10690 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10691 : :
10692 : 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10693 : : {
10694 : : /* For now, we only support cases in which all loads and stores fall back
10695 : : to VnQI or none do. */
10696 : 0 : gcc_assert (!rgl->max_nscalars_per_iter
10697 : : || (rgl->factor == 1 && factor == 1)
10698 : : || (rgl->max_nscalars_per_iter * rgl->factor
10699 : : == nscalars_per_iter * factor));
10700 : 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
10701 : 0 : rgl->type = vectype;
10702 : 0 : rgl->factor = factor;
10703 : : }
10704 : 0 : }
10705 : :
10706 : : /* Given a complete set of lengths LENS, extract length number INDEX
10707 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10708 : : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10709 : : multipled by the number of elements that should be processed.
10710 : : Insert any set-up statements before GSI. */
10711 : :
10712 : : tree
10713 : 0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10714 : : vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10715 : : unsigned int index, unsigned int factor)
10716 : : {
10717 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10718 : 0 : bool use_bias_adjusted_len =
10719 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10720 : :
10721 : : /* Populate the rgroup's len array, if this is the first time we've
10722 : : used it. */
10723 : 0 : if (rgl->controls.is_empty ())
10724 : : {
10725 : 0 : rgl->controls.safe_grow_cleared (nvectors, true);
10726 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
10727 : : {
10728 : 0 : tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10729 : 0 : gcc_assert (len_type != NULL_TREE);
10730 : :
10731 : 0 : tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10732 : :
10733 : : /* Provide a dummy definition until the real one is available. */
10734 : 0 : SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10735 : 0 : rgl->controls[i] = len;
10736 : :
10737 : 0 : if (use_bias_adjusted_len)
10738 : : {
10739 : 0 : gcc_assert (i == 0);
10740 : 0 : tree adjusted_len =
10741 : 0 : make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10742 : 0 : SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10743 : 0 : rgl->bias_adjusted_ctrl = adjusted_len;
10744 : : }
10745 : : }
10746 : : }
10747 : :
10748 : 0 : if (use_bias_adjusted_len)
10749 : 0 : return rgl->bias_adjusted_ctrl;
10750 : :
10751 : 0 : tree loop_len = rgl->controls[index];
10752 : 0 : if (rgl->factor == 1 && factor == 1)
10753 : : {
10754 : 0 : poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10755 : 0 : poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10756 : 0 : if (maybe_ne (nunits1, nunits2))
10757 : : {
10758 : : /* A loop len for data type X can be reused for data type Y
10759 : : if X has N times more elements than Y and if Y's elements
10760 : : are N times bigger than X's. */
10761 : 0 : gcc_assert (multiple_p (nunits1, nunits2));
10762 : 0 : factor = exact_div (nunits1, nunits2).to_constant ();
10763 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10764 : 0 : gimple_seq seq = NULL;
10765 : 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10766 : 0 : build_int_cst (iv_type, factor));
10767 : 0 : if (seq)
10768 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10769 : : }
10770 : : }
10771 : : return loop_len;
10772 : : }
10773 : :
10774 : : /* Generate the tree for the loop len mask and return it. Given the lens,
10775 : : nvectors, vectype, index and factor to gen the len mask as below.
10776 : :
10777 : : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
10778 : : */
10779 : : tree
10780 : 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10781 : : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
10782 : : unsigned int nvectors, tree vectype, tree stmt,
10783 : : unsigned int index, unsigned int factor)
10784 : : {
10785 : 0 : tree all_one_mask = build_all_ones_cst (vectype);
10786 : 0 : tree all_zero_mask = build_zero_cst (vectype);
10787 : 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
10788 : : factor);
10789 : 0 : tree bias = build_int_cst (intQI_type_node,
10790 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
10791 : 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
10792 : 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
10793 : : all_one_mask, all_zero_mask, len,
10794 : : bias);
10795 : 0 : gimple_call_set_lhs (call, len_mask);
10796 : 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
10797 : :
10798 : 0 : return len_mask;
10799 : : }
10800 : :
10801 : : /* Scale profiling counters by estimation for LOOP which is vectorized
10802 : : by factor VF.
10803 : : If FLAT is true, the loop we started with had unrealistically flat
10804 : : profile. */
10805 : :
10806 : : static void
10807 : 57678 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
10808 : : {
10809 : : /* For flat profiles do not scale down proportionally by VF and only
10810 : : cap by known iteration count bounds. */
10811 : 57678 : if (flat)
10812 : : {
10813 : 32925 : if (dump_file && (dump_flags & TDF_DETAILS))
10814 : 4852 : fprintf (dump_file,
10815 : : "Vectorized loop profile seems flat; not scaling iteration "
10816 : : "count down by the vectorization factor %i\n", vf);
10817 : 32925 : scale_loop_profile (loop, profile_probability::always (),
10818 : : get_likely_max_loop_iterations_int (loop));
10819 : 32925 : return;
10820 : : }
10821 : : /* Loop body executes VF fewer times and exit increases VF times. */
10822 : 24753 : profile_count entry_count = loop_preheader_edge (loop)->count ();
10823 : :
10824 : : /* If we have unreliable loop profile avoid dropping entry
10825 : : count below header count. This can happen since loops
10826 : : has unrealistically low trip counts. */
10827 : 24753 : while (vf > 1
10828 : 25923 : && loop->header->count > entry_count
10829 : 52687 : && loop->header->count < entry_count * vf)
10830 : : {
10831 : 2011 : if (dump_file && (dump_flags & TDF_DETAILS))
10832 : 149 : fprintf (dump_file,
10833 : : "Vectorization factor %i seems too large for profile "
10834 : : "prevoiusly believed to be consistent; reducing.\n", vf);
10835 : 2011 : vf /= 2;
10836 : : }
10837 : :
10838 : 24753 : if (entry_count.nonzero_p ())
10839 : 24753 : set_edge_probability_and_rescale_others
10840 : 24753 : (exit_e,
10841 : 24753 : entry_count.probability_in (loop->header->count / vf));
10842 : : /* Avoid producing very large exit probability when we do not have
10843 : : sensible profile. */
10844 : 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
10845 : 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10846 : 24753 : loop->latch->count = single_pred_edge (loop->latch)->count ();
10847 : :
10848 : 24753 : scale_loop_profile (loop, profile_probability::always () / vf,
10849 : : get_likely_max_loop_iterations_int (loop));
10850 : : }
10851 : :
10852 : : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10853 : : original loop that has now been vectorized.
10854 : :
10855 : : The inits of the data_references need to be advanced with the number of
10856 : : iterations of the main loop. This has been computed in vect_do_peeling and
10857 : : is stored in parameter ADVANCE.
10858 : :
10859 : : Since the loop_vec_info of this EPILOGUE was constructed for the original
10860 : : loop, its stmt_vec_infos all point to the original statements. These need
10861 : : to be updated to point to their corresponding copies.
10862 : :
10863 : : The data_reference's connections also need to be updated. Their
10864 : : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10865 : : stmt_vec_infos, their statements need to point to their corresponding
10866 : : copy. */
10867 : :
10868 : : static void
10869 : 6712 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10870 : : {
10871 : 6712 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10872 : 6712 : hash_map<tree,tree> mapping;
10873 : 6712 : gimple *orig_stmt, *new_stmt;
10874 : 6712 : gimple_stmt_iterator epilogue_gsi;
10875 : 6712 : gphi_iterator epilogue_phi_gsi;
10876 : 6712 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10877 : 6712 : basic_block *epilogue_bbs = get_loop_body (epilogue);
10878 : 6712 : unsigned i;
10879 : :
10880 : 6712 : free (LOOP_VINFO_BBS (epilogue_vinfo));
10881 : 6712 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10882 : 6712 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
10883 : :
10884 : : /* The EPILOGUE loop is a copy of the original loop so they share the same
10885 : : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10886 : : point to the copied statements. */
10887 : 20136 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10888 : : {
10889 : 13424 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10890 : 34625 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10891 : : {
10892 : 21201 : new_stmt = epilogue_phi_gsi.phi ();
10893 : :
10894 : 21201 : gcc_assert (gimple_uid (new_stmt) > 0);
10895 : 21201 : stmt_vinfo
10896 : 21201 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10897 : :
10898 : 21201 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10899 : : }
10900 : :
10901 : 26848 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10902 : 133096 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10903 : : {
10904 : 119672 : new_stmt = gsi_stmt (epilogue_gsi);
10905 : 119672 : if (is_gimple_debug (new_stmt))
10906 : 21220 : continue;
10907 : :
10908 : 98452 : gcc_assert (gimple_uid (new_stmt) > 0);
10909 : 98452 : stmt_vinfo
10910 : 98452 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10911 : :
10912 : 98452 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10913 : :
10914 : 98452 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10915 : 98452 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10916 : : {
10917 : 1827 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10918 : : /* Set BB such that the assert in
10919 : : 'get_initial_defs_for_reduction' is able to determine that
10920 : : the BB of the related stmt is inside this loop. */
10921 : 1827 : gimple_set_bb (stmt,
10922 : : gimple_bb (new_stmt));
10923 : 1827 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10924 : 1827 : gcc_assert (related_vinfo == NULL
10925 : : || related_vinfo == stmt_vinfo);
10926 : : }
10927 : : }
10928 : : }
10929 : :
10930 : 6712 : struct data_reference *dr;
10931 : 6712 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10932 : 28611 : FOR_EACH_VEC_ELT (datarefs, i, dr)
10933 : : {
10934 : 21899 : orig_stmt = DR_STMT (dr);
10935 : 21899 : gcc_assert (gimple_uid (orig_stmt) > 0);
10936 : 21899 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10937 : 21899 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10938 : : }
10939 : :
10940 : : /* Advance data_reference's with the number of iterations of the previous
10941 : : loop and its prologue. */
10942 : 6712 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10943 : :
10944 : : /* Remember the advancement made. */
10945 : 6712 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
10946 : 6712 : }
10947 : :
10948 : : /* When vectorizing early break statements instructions that happen before
10949 : : the early break in the current BB need to be moved to after the early
10950 : : break. This function deals with that and assumes that any validity
10951 : : checks has already been performed.
10952 : :
10953 : : While moving the instructions if it encounters a VUSE or VDEF it then
10954 : : corrects the VUSES as it moves the statements along. GDEST is the location
10955 : : in which to insert the new statements. */
10956 : :
10957 : : static void
10958 : 1418 : move_early_exit_stmts (loop_vec_info loop_vinfo)
10959 : : {
10960 : 1418 : DUMP_VECT_SCOPE ("move_early_exit_stmts");
10961 : :
10962 : 1418 : if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
10963 : 1214 : return;
10964 : :
10965 : : /* Move all stmts that need moving. */
10966 : 204 : basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
10967 : 204 : gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
10968 : :
10969 : 204 : tree last_seen_vuse = NULL_TREE;
10970 : 503 : for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
10971 : : {
10972 : : /* We have to update crossed degenerate virtual PHIs. Simply
10973 : : elide them. */
10974 : 299 : if (gphi *vphi = dyn_cast <gphi *> (stmt))
10975 : : {
10976 : 7 : tree vdef = gimple_phi_result (vphi);
10977 : 7 : tree vuse = gimple_phi_arg_def (vphi, 0);
10978 : 7 : imm_use_iterator iter;
10979 : 7 : use_operand_p use_p;
10980 : 7 : gimple *use_stmt;
10981 : 23 : FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
10982 : : {
10983 : 48 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
10984 : 16 : SET_USE (use_p, vuse);
10985 : 7 : }
10986 : 7 : auto gsi = gsi_for_stmt (stmt);
10987 : 7 : remove_phi_node (&gsi, true);
10988 : 7 : last_seen_vuse = vuse;
10989 : 7 : continue;
10990 : 7 : }
10991 : :
10992 : : /* Check to see if statement is still required for vect or has been
10993 : : elided. */
10994 : 292 : auto stmt_info = loop_vinfo->lookup_stmt (stmt);
10995 : 292 : if (!stmt_info)
10996 : 0 : continue;
10997 : :
10998 : 292 : if (dump_enabled_p ())
10999 : 147 : dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11000 : :
11001 : 292 : gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11002 : 292 : gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
11003 : 584 : last_seen_vuse = gimple_vuse (stmt);
11004 : : }
11005 : :
11006 : : /* Update all the stmts with their new reaching VUSES. */
11007 : 628 : for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11008 : : {
11009 : 178 : if (dump_enabled_p ())
11010 : 142 : dump_printf_loc (MSG_NOTE, vect_location,
11011 : : "updating vuse to %T for load %G",
11012 : : last_seen_vuse, p);
11013 : 178 : gimple_set_vuse (p, last_seen_vuse);
11014 : 178 : update_stmt (p);
11015 : : }
11016 : :
11017 : : /* And update the LC PHIs on exits. */
11018 : 1026 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11019 : 414 : if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11020 : 218 : if (gphi *phi = get_virtual_phi (e->dest))
11021 : 422 : SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11022 : : }
11023 : :
11024 : : /* Function vect_transform_loop.
11025 : :
11026 : : The analysis phase has determined that the loop is vectorizable.
11027 : : Vectorize the loop - created vectorized stmts to replace the scalar
11028 : : stmts in the loop, and update the loop exit condition.
11029 : : Returns scalar epilogue loop if any. */
11030 : :
11031 : : class loop *
11032 : 57678 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11033 : : {
11034 : 57678 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11035 : 57678 : class loop *epilogue = NULL;
11036 : 57678 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11037 : 57678 : int nbbs = loop->num_nodes;
11038 : 57678 : int i;
11039 : 57678 : tree niters_vector = NULL_TREE;
11040 : 57678 : tree step_vector = NULL_TREE;
11041 : 57678 : tree niters_vector_mult_vf = NULL_TREE;
11042 : 57678 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11043 : 57678 : unsigned int lowest_vf = constant_lower_bound (vf);
11044 : 57678 : gimple *stmt;
11045 : 57678 : bool check_profitability = false;
11046 : 57678 : unsigned int th;
11047 : 57678 : bool flat = maybe_flat_loop_profile (loop);
11048 : :
11049 : 57678 : DUMP_VECT_SCOPE ("vec_transform_loop");
11050 : :
11051 : 57678 : if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11052 : 50966 : loop_vinfo->shared->check_datarefs ();
11053 : :
11054 : : /* Use the more conservative vectorization threshold. If the number
11055 : : of iterations is constant assume the cost check has been performed
11056 : : by our caller. If the threshold makes all loops profitable that
11057 : : run at least the (estimated) vectorization factor number of times
11058 : : checking is pointless, too. */
11059 : 57678 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11060 : 57678 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11061 : : {
11062 : 17580 : if (dump_enabled_p ())
11063 : 162 : dump_printf_loc (MSG_NOTE, vect_location,
11064 : : "Profitability threshold is %d loop iterations.\n",
11065 : : th);
11066 : : check_profitability = true;
11067 : : }
11068 : :
11069 : : /* Make sure there exists a single-predecessor exit bb. Do this before
11070 : : versioning. */
11071 : 57678 : edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11072 : 57678 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11073 : : {
11074 : 11754 : split_loop_exit_edge (e, true);
11075 : 11754 : if (dump_enabled_p ())
11076 : 1952 : dump_printf (MSG_NOTE, "split exit edge\n");
11077 : : }
11078 : :
11079 : : /* Version the loop first, if required, so the profitability check
11080 : : comes first. */
11081 : :
11082 : 57678 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11083 : : {
11084 : 3678 : class loop *sloop
11085 : 3678 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11086 : 3678 : sloop->force_vectorize = false;
11087 : 3678 : check_profitability = false;
11088 : : }
11089 : :
11090 : : /* Make sure there exists a single-predecessor exit bb also on the
11091 : : scalar loop copy. Do this after versioning but before peeling
11092 : : so CFG structure is fine for both scalar and if-converted loop
11093 : : to make slpeel_duplicate_current_defs_from_edges face matched
11094 : : loop closed PHI nodes on the exit. */
11095 : 57678 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11096 : : {
11097 : 6081 : e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11098 : 6081 : if (! single_pred_p (e->dest))
11099 : : {
11100 : 5831 : split_loop_exit_edge (e, true);
11101 : 5831 : if (dump_enabled_p ())
11102 : 1101 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11103 : : }
11104 : : }
11105 : :
11106 : 57678 : tree niters = vect_build_loop_niters (loop_vinfo);
11107 : 57678 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11108 : 57678 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11109 : 57678 : bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11110 : 57678 : tree advance;
11111 : 57678 : drs_init_vec orig_drs_init;
11112 : :
11113 : 57678 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11114 : : &step_vector, &niters_vector_mult_vf, th,
11115 : : check_profitability, niters_no_overflow,
11116 : : &advance);
11117 : 57678 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11118 : 57678 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11119 : : {
11120 : : /* Ifcvt duplicates loop preheader, loop body and produces an basic
11121 : : block after loop exit. We need to scale all that. */
11122 : 91 : basic_block preheader
11123 : 91 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11124 : 91 : preheader->count
11125 : : = preheader->count.apply_probability
11126 : 91 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11127 : 91 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11128 : : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11129 : 91 : LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11130 : : }
11131 : :
11132 : 57678 : if (niters_vector == NULL_TREE)
11133 : : {
11134 : 25009 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11135 : 25009 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11136 : 50749 : && known_eq (lowest_vf, vf))
11137 : : {
11138 : 25006 : niters_vector
11139 : 25006 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11140 : 25006 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11141 : 25006 : step_vector = build_one_cst (TREE_TYPE (niters));
11142 : : }
11143 : 737 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11144 : 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11145 : : &step_vector, niters_no_overflow);
11146 : : else
11147 : : /* vect_do_peeling subtracted the number of peeled prologue
11148 : : iterations from LOOP_VINFO_NITERS. */
11149 : 736 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11150 : : &niters_vector, &step_vector,
11151 : : niters_no_overflow);
11152 : : }
11153 : :
11154 : : /* 1) Make sure the loop header has exactly two entries
11155 : : 2) Make sure we have a preheader basic block. */
11156 : :
11157 : 57678 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11158 : :
11159 : 57678 : split_edge (loop_preheader_edge (loop));
11160 : :
11161 : 57678 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11162 : : /* This will deal with any possible peeling. */
11163 : 1 : vect_prepare_for_masked_peels (loop_vinfo);
11164 : :
11165 : : /* Handle any code motion that we need to for early-break vectorization after
11166 : : we've done peeling but just before we start vectorizing. */
11167 : 57678 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11168 : 1418 : move_early_exit_stmts (loop_vinfo);
11169 : :
11170 : : /* Remove existing clobber stmts and prefetches. */
11171 : 176158 : for (i = 0; i < nbbs; i++)
11172 : : {
11173 : 118480 : basic_block bb = bbs[i];
11174 : 1021984 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11175 : : {
11176 : 785024 : stmt = gsi_stmt (si);
11177 : 785024 : if (gimple_clobber_p (stmt)
11178 : 785024 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11179 : : {
11180 : 193 : unlink_stmt_vdef (stmt);
11181 : 193 : gsi_remove (&si, true);
11182 : 193 : release_defs (stmt);
11183 : : }
11184 : : else
11185 : 784831 : gsi_next (&si);
11186 : : }
11187 : : }
11188 : :
11189 : : /* Schedule the SLP instances. */
11190 : 57678 : if (!loop_vinfo->slp_instances.is_empty ())
11191 : : {
11192 : 57678 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11193 : 57678 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11194 : : }
11195 : :
11196 : : /* Generate the loop invariant statements. */
11197 : 57678 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11198 : : {
11199 : 74 : if (dump_enabled_p ())
11200 : 30 : dump_printf_loc (MSG_NOTE, vect_location,
11201 : : "------>generating loop invariant statements\n");
11202 : 74 : gimple_stmt_iterator gsi;
11203 : 74 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11204 : 74 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11205 : : GSI_CONTINUE_LINKING);
11206 : : }
11207 : :
11208 : : /* Stub out scalar statements that must not survive vectorization and
11209 : : were not picked as relevant in any SLP instance.
11210 : : Doing this here helps with grouped statements, or statements that
11211 : : are involved in patterns. */
11212 : 176158 : for (i = 0; i < nbbs; i++)
11213 : : {
11214 : 118480 : basic_block bb = bbs[i];
11215 : 118480 : stmt_vec_info stmt_info;
11216 : 236960 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11217 : 1557527 : !gsi_end_p (gsi); gsi_next (&gsi))
11218 : : {
11219 : 1439047 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11220 : 5403 : if (!call || !gimple_call_internal_p (call))
11221 : 1434795 : continue;
11222 : 4252 : internal_fn ifn = gimple_call_internal_fn (call);
11223 : 4252 : if (ifn == IFN_MASK_LOAD)
11224 : : {
11225 : 554 : tree lhs = gimple_get_lhs (call);
11226 : 554 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11227 : : {
11228 : 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11229 : 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11230 : 0 : gsi_replace (&gsi, new_stmt, true);
11231 : : }
11232 : : }
11233 : 3698 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11234 : : {
11235 : 1579 : tree lhs = gimple_get_lhs (call);
11236 : 1579 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11237 : : {
11238 : 0 : tree else_arg
11239 : 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11240 : 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11241 : 0 : gsi_replace (&gsi, new_stmt, true);
11242 : : }
11243 : : }
11244 : 2119 : else if (ifn == IFN_MASK_CALL
11245 : 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11246 : 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11247 : 2123 : && !STMT_VINFO_LIVE_P (stmt_info))
11248 : : {
11249 : 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11250 : 4 : loop_vinfo->remove_stmt (stmt_info);
11251 : : }
11252 : : }
11253 : : }
11254 : :
11255 : : /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11256 : : a zero NITERS becomes a nonzero NITERS_VECTOR. */
11257 : 57678 : if (integer_onep (step_vector))
11258 : 57664 : niters_no_overflow = true;
11259 : 57678 : vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11260 : : niters_vector, step_vector, niters_vector_mult_vf,
11261 : 57678 : !niters_no_overflow);
11262 : :
11263 : 57678 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11264 : :
11265 : : /* True if the final iteration might not handle a full vector's
11266 : : worth of scalar iterations. */
11267 : 115356 : bool final_iter_may_be_partial
11268 : 57678 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11269 : 57678 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
11270 : :
11271 : : /* +1 to convert latch counts to loop iteration counts. */
11272 : 57678 : int bias_for_lowest = 1;
11273 : :
11274 : : /* When we are peeling for gaps then we take away one scalar iteration
11275 : : from the vector loop. Thus we can adjust the upper bound by one
11276 : : scalar iteration. But only when we know the bound applies to the
11277 : : IV exit test which might not be true when we have multiple exits. */
11278 : 57678 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11279 : 112185 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11280 : :
11281 : 57678 : int bias_for_assumed = bias_for_lowest;
11282 : 57678 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11283 : 57678 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11284 : : {
11285 : : /* When the amount of peeling is known at compile time, the first
11286 : : iteration will have exactly alignment_npeels active elements.
11287 : : In the worst case it will have at least one. */
11288 : 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11289 : 1 : bias_for_lowest += lowest_vf - min_first_active;
11290 : 1 : bias_for_assumed += assumed_vf - min_first_active;
11291 : : }
11292 : : /* In these calculations the "- 1" converts loop iteration counts
11293 : : back to latch counts. */
11294 : 57678 : if (loop->any_upper_bound)
11295 : : {
11296 : 57678 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11297 : 57678 : loop->nb_iterations_upper_bound
11298 : 57678 : = (final_iter_may_be_partial
11299 : 59110 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11300 : 2864 : lowest_vf) - 1
11301 : 56246 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11302 : 112492 : lowest_vf) - 1);
11303 : 57678 : if (main_vinfo
11304 : : /* Both peeling for alignment and peeling for gaps can end up
11305 : : with the scalar epilogue running for more than VF-1 iterations. */
11306 : 6712 : && !main_vinfo->peeling_for_alignment
11307 : 6664 : && !main_vinfo->peeling_for_gaps)
11308 : : {
11309 : 6508 : unsigned int bound;
11310 : 6508 : poly_uint64 main_iters
11311 : 6508 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11312 : : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11313 : 6508 : main_iters
11314 : 6508 : = upper_bound (main_iters,
11315 : 6508 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11316 : 13016 : if (can_div_away_from_zero_p (main_iters,
11317 : 6508 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11318 : : &bound))
11319 : 6508 : loop->nb_iterations_upper_bound
11320 : 6508 : = wi::umin ((bound_wide_int) (bound - 1),
11321 : 6508 : loop->nb_iterations_upper_bound);
11322 : : }
11323 : : }
11324 : 57678 : if (loop->any_likely_upper_bound)
11325 : 57678 : loop->nb_iterations_likely_upper_bound
11326 : 57678 : = (final_iter_may_be_partial
11327 : 59110 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11328 : 1432 : + bias_for_lowest, lowest_vf) - 1
11329 : 56246 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11330 : 57678 : + bias_for_lowest, lowest_vf) - 1);
11331 : 57678 : if (loop->any_estimate)
11332 : 32943 : loop->nb_iterations_estimate
11333 : 32943 : = (final_iter_may_be_partial
11334 : 33730 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11335 : 1574 : assumed_vf) - 1
11336 : 32156 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11337 : 65099 : assumed_vf) - 1);
11338 : 57678 : scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11339 : : assumed_vf, flat);
11340 : :
11341 : 57678 : if (dump_enabled_p ())
11342 : : {
11343 : 10171 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11344 : : {
11345 : 8857 : dump_printf_loc (MSG_NOTE, vect_location,
11346 : : "LOOP VECTORIZED\n");
11347 : 8857 : if (loop->inner)
11348 : 279 : dump_printf_loc (MSG_NOTE, vect_location,
11349 : : "OUTER LOOP VECTORIZED\n");
11350 : 8857 : dump_printf (MSG_NOTE, "\n");
11351 : : }
11352 : : else
11353 : 1314 : dump_printf_loc (MSG_NOTE, vect_location,
11354 : : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11355 : 1314 : GET_MODE_NAME (loop_vinfo->vector_mode));
11356 : : }
11357 : :
11358 : : /* Loops vectorized with a variable factor won't benefit from
11359 : : unrolling/peeling. */
11360 : 57678 : if (!vf.is_constant ())
11361 : : {
11362 : : loop->unroll = 1;
11363 : : if (dump_enabled_p ())
11364 : : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11365 : : " variable-length vectorization factor\n");
11366 : : }
11367 : :
11368 : : /* When we have unrolled the loop due to a user requested value we should
11369 : : leave it up to the RTL unroll heuristics to determine if it's still worth
11370 : : while to unroll more. */
11371 : 57678 : if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
11372 : 40 : loop->unroll = 0;
11373 : :
11374 : : /* Free SLP instances here because otherwise stmt reference counting
11375 : : won't work. */
11376 : : slp_instance instance;
11377 : 145449 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11378 : 87771 : vect_free_slp_instance (instance);
11379 : 57678 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11380 : : /* Clear-up safelen field since its value is invalid after vectorization
11381 : : since vectorized loop can have loop-carried dependencies. */
11382 : 57678 : loop->safelen = 0;
11383 : :
11384 : 57678 : if (epilogue)
11385 : : {
11386 : : /* Accumulate past advancements made. */
11387 : 6712 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
11388 : 88 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
11389 : : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
11390 : : advance);
11391 : 6712 : update_epilogue_loop_vinfo (epilogue, advance);
11392 : :
11393 : 6712 : epilogue->simduid = loop->simduid;
11394 : 6712 : epilogue->force_vectorize = loop->force_vectorize;
11395 : 6712 : epilogue->dont_vectorize = false;
11396 : : }
11397 : :
11398 : 57678 : return epilogue;
11399 : 57678 : }
11400 : :
11401 : : /* The code below is trying to perform simple optimization - revert
11402 : : if-conversion for masked stores, i.e. if the mask of a store is zero
11403 : : do not perform it and all stored value producers also if possible.
11404 : : For example,
11405 : : for (i=0; i<n; i++)
11406 : : if (c[i])
11407 : : {
11408 : : p1[i] += 1;
11409 : : p2[i] = p3[i] +2;
11410 : : }
11411 : : this transformation will produce the following semi-hammock:
11412 : :
11413 : : if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11414 : : {
11415 : : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11416 : : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11417 : : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11418 : : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11419 : : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11420 : : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11421 : : }
11422 : : */
11423 : :
11424 : : void
11425 : 486 : optimize_mask_stores (class loop *loop)
11426 : : {
11427 : 486 : basic_block *bbs = get_loop_body (loop);
11428 : 486 : unsigned nbbs = loop->num_nodes;
11429 : 486 : unsigned i;
11430 : 486 : basic_block bb;
11431 : 486 : class loop *bb_loop;
11432 : 486 : gimple_stmt_iterator gsi;
11433 : 486 : gimple *stmt;
11434 : 486 : auto_vec<gimple *> worklist;
11435 : 486 : auto_purge_vect_location sentinel;
11436 : :
11437 : 486 : vect_location = find_loop_location (loop);
11438 : : /* Pick up all masked stores in loop if any. */
11439 : 1944 : for (i = 0; i < nbbs; i++)
11440 : : {
11441 : 972 : bb = bbs[i];
11442 : 15932 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11443 : 13988 : gsi_next (&gsi))
11444 : : {
11445 : 13988 : stmt = gsi_stmt (gsi);
11446 : 13988 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11447 : 634 : worklist.safe_push (stmt);
11448 : : }
11449 : : }
11450 : :
11451 : 486 : free (bbs);
11452 : 486 : if (worklist.is_empty ())
11453 : 68 : return;
11454 : :
11455 : : /* Loop has masked stores. */
11456 : 1035 : while (!worklist.is_empty ())
11457 : : {
11458 : 617 : gimple *last, *last_store;
11459 : 617 : edge e, efalse;
11460 : 617 : tree mask;
11461 : 617 : basic_block store_bb, join_bb;
11462 : 617 : gimple_stmt_iterator gsi_to;
11463 : 617 : tree vdef, new_vdef;
11464 : 617 : gphi *phi;
11465 : 617 : tree vectype;
11466 : 617 : tree zero;
11467 : :
11468 : 617 : last = worklist.pop ();
11469 : 617 : mask = gimple_call_arg (last, 2);
11470 : 617 : bb = gimple_bb (last);
11471 : : /* Create then_bb and if-then structure in CFG, then_bb belongs to
11472 : : the same loop as if_bb. It could be different to LOOP when two
11473 : : level loop-nest is vectorized and mask_store belongs to the inner
11474 : : one. */
11475 : 617 : e = split_block (bb, last);
11476 : 617 : bb_loop = bb->loop_father;
11477 : 617 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11478 : 617 : join_bb = e->dest;
11479 : 617 : store_bb = create_empty_bb (bb);
11480 : 617 : add_bb_to_loop (store_bb, bb_loop);
11481 : 617 : e->flags = EDGE_TRUE_VALUE;
11482 : 617 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11483 : : /* Put STORE_BB to likely part. */
11484 : 617 : efalse->probability = profile_probability::likely ();
11485 : 617 : e->probability = efalse->probability.invert ();
11486 : 617 : store_bb->count = efalse->count ();
11487 : 617 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11488 : 617 : if (dom_info_available_p (CDI_DOMINATORS))
11489 : 617 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11490 : 617 : if (dump_enabled_p ())
11491 : 299 : dump_printf_loc (MSG_NOTE, vect_location,
11492 : : "Create new block %d to sink mask stores.",
11493 : : store_bb->index);
11494 : : /* Create vector comparison with boolean result. */
11495 : 617 : vectype = TREE_TYPE (mask);
11496 : 617 : zero = build_zero_cst (vectype);
11497 : 617 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11498 : 617 : gsi = gsi_last_bb (bb);
11499 : 617 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11500 : : /* Create new PHI node for vdef of the last masked store:
11501 : : .MEM_2 = VDEF <.MEM_1>
11502 : : will be converted to
11503 : : .MEM.3 = VDEF <.MEM_1>
11504 : : and new PHI node will be created in join bb
11505 : : .MEM_2 = PHI <.MEM_1, .MEM_3>
11506 : : */
11507 : 617 : vdef = gimple_vdef (last);
11508 : 617 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
11509 : 617 : gimple_set_vdef (last, new_vdef);
11510 : 617 : phi = create_phi_node (vdef, join_bb);
11511 : 617 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11512 : :
11513 : : /* Put all masked stores with the same mask to STORE_BB if possible. */
11514 : 651 : while (true)
11515 : : {
11516 : 634 : gimple_stmt_iterator gsi_from;
11517 : 634 : gimple *stmt1 = NULL;
11518 : :
11519 : : /* Move masked store to STORE_BB. */
11520 : 634 : last_store = last;
11521 : 634 : gsi = gsi_for_stmt (last);
11522 : 634 : gsi_from = gsi;
11523 : : /* Shift GSI to the previous stmt for further traversal. */
11524 : 634 : gsi_prev (&gsi);
11525 : 634 : gsi_to = gsi_start_bb (store_bb);
11526 : 634 : gsi_move_before (&gsi_from, &gsi_to);
11527 : : /* Setup GSI_TO to the non-empty block start. */
11528 : 634 : gsi_to = gsi_start_bb (store_bb);
11529 : 634 : if (dump_enabled_p ())
11530 : 315 : dump_printf_loc (MSG_NOTE, vect_location,
11531 : : "Move stmt to created bb\n%G", last);
11532 : : /* Move all stored value producers if possible. */
11533 : 4348 : while (!gsi_end_p (gsi))
11534 : : {
11535 : 4347 : tree lhs;
11536 : 4347 : imm_use_iterator imm_iter;
11537 : 4347 : use_operand_p use_p;
11538 : 4347 : bool res;
11539 : :
11540 : : /* Skip debug statements. */
11541 : 4347 : if (is_gimple_debug (gsi_stmt (gsi)))
11542 : : {
11543 : 1 : gsi_prev (&gsi);
11544 : 2705 : continue;
11545 : : }
11546 : 4346 : stmt1 = gsi_stmt (gsi);
11547 : : /* Do not consider statements writing to memory or having
11548 : : volatile operand. */
11549 : 8584 : if (gimple_vdef (stmt1)
11550 : 8584 : || gimple_has_volatile_ops (stmt1))
11551 : : break;
11552 : 4238 : gsi_from = gsi;
11553 : 4238 : gsi_prev (&gsi);
11554 : 4238 : lhs = gimple_get_lhs (stmt1);
11555 : 4238 : if (!lhs)
11556 : : break;
11557 : :
11558 : : /* LHS of vectorized stmt must be SSA_NAME. */
11559 : 4238 : if (TREE_CODE (lhs) != SSA_NAME)
11560 : : break;
11561 : :
11562 : 4238 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11563 : : {
11564 : : /* Remove dead scalar statement. */
11565 : 2996 : if (has_zero_uses (lhs))
11566 : : {
11567 : 2704 : gsi_remove (&gsi_from, true);
11568 : 2704 : release_defs (stmt1);
11569 : 2704 : continue;
11570 : : }
11571 : : }
11572 : :
11573 : : /* Check that LHS does not have uses outside of STORE_BB. */
11574 : 1534 : res = true;
11575 : 2633 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11576 : : {
11577 : 1624 : gimple *use_stmt;
11578 : 1624 : use_stmt = USE_STMT (use_p);
11579 : 1624 : if (is_gimple_debug (use_stmt))
11580 : 0 : continue;
11581 : 1624 : if (gimple_bb (use_stmt) != store_bb)
11582 : : {
11583 : : res = false;
11584 : : break;
11585 : : }
11586 : : }
11587 : 1534 : if (!res)
11588 : : break;
11589 : :
11590 : 1009 : if (gimple_vuse (stmt1)
11591 : 1443 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
11592 : : break;
11593 : :
11594 : : /* Can move STMT1 to STORE_BB. */
11595 : 1009 : if (dump_enabled_p ())
11596 : 529 : dump_printf_loc (MSG_NOTE, vect_location,
11597 : : "Move stmt to created bb\n%G", stmt1);
11598 : 1009 : gsi_move_before (&gsi_from, &gsi_to);
11599 : : /* Shift GSI_TO for further insertion. */
11600 : 2018 : gsi_prev (&gsi_to);
11601 : : }
11602 : : /* Put other masked stores with the same mask to STORE_BB. */
11603 : 634 : if (worklist.is_empty ()
11604 : 216 : || gimple_call_arg (worklist.last (), 2) != mask
11605 : 17 : || worklist.last () != stmt1)
11606 : : break;
11607 : 17 : last = worklist.pop ();
11608 : 17 : }
11609 : 1234 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11610 : : }
11611 : 486 : }
11612 : :
11613 : : /* Decide whether it is possible to use a zero-based induction variable
11614 : : when vectorizing LOOP_VINFO with partial vectors. If it is, return
11615 : : the value that the induction variable must be able to hold in order
11616 : : to ensure that the rgroups eventually have no active vector elements.
11617 : : Return -1 otherwise. */
11618 : :
11619 : : widest_int
11620 : 62 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11621 : : {
11622 : 62 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11623 : 62 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11624 : 62 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11625 : :
11626 : : /* Calculate the value that the induction variable must be able
11627 : : to hit in order to ensure that we end the loop with an all-false mask.
11628 : : This involves adding the maximum number of inactive trailing scalar
11629 : : iterations. */
11630 : 62 : widest_int iv_limit = -1;
11631 : 62 : if (max_loop_iterations (loop, &iv_limit))
11632 : : {
11633 : 62 : if (niters_skip)
11634 : : {
11635 : : /* Add the maximum number of skipped iterations to the
11636 : : maximum iteration count. */
11637 : 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
11638 : 0 : iv_limit += wi::to_widest (niters_skip);
11639 : : else
11640 : 0 : iv_limit += max_vf - 1;
11641 : : }
11642 : 62 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11643 : : /* Make a conservatively-correct assumption. */
11644 : 2 : iv_limit += max_vf - 1;
11645 : :
11646 : : /* IV_LIMIT is the maximum number of latch iterations, which is also
11647 : : the maximum in-range IV value. Round this value down to the previous
11648 : : vector alignment boundary and then add an extra full iteration. */
11649 : 62 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11650 : 62 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11651 : : }
11652 : 62 : return iv_limit;
11653 : : }
11654 : :
11655 : : /* For the given rgroup_controls RGC, check whether an induction variable
11656 : : would ever hit a value that produces a set of all-false masks or zero
11657 : : lengths before wrapping around. Return true if it's possible to wrap
11658 : : around before hitting the desirable value, otherwise return false. */
11659 : :
11660 : : bool
11661 : 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11662 : : {
11663 : 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11664 : :
11665 : 0 : if (iv_limit == -1)
11666 : : return true;
11667 : :
11668 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11669 : 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
11670 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11671 : :
11672 : 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11673 : : return true;
11674 : :
11675 : : return false;
11676 : 0 : }
|