Branch data Line data Source code
1 : : /* Loop Vectorization
2 : : Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : : Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #define INCLUDE_ALGORITHM
23 : : #include "config.h"
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "cfghooks.h"
32 : : #include "tree-pass.h"
33 : : #include "ssa.h"
34 : : #include "optabs-tree.h"
35 : : #include "memmodel.h"
36 : : #include "optabs.h"
37 : : #include "diagnostic-core.h"
38 : : #include "fold-const.h"
39 : : #include "stor-layout.h"
40 : : #include "cfganal.h"
41 : : #include "gimplify.h"
42 : : #include "gimple-iterator.h"
43 : : #include "gimplify-me.h"
44 : : #include "tree-ssa-loop-ivopts.h"
45 : : #include "tree-ssa-loop-manip.h"
46 : : #include "tree-ssa-loop-niter.h"
47 : : #include "tree-ssa-loop.h"
48 : : #include "cfgloop.h"
49 : : #include "tree-scalar-evolution.h"
50 : : #include "tree-vectorizer.h"
51 : : #include "gimple-fold.h"
52 : : #include "cgraph.h"
53 : : #include "tree-cfg.h"
54 : : #include "tree-if-conv.h"
55 : : #include "internal-fn.h"
56 : : #include "tree-vector-builder.h"
57 : : #include "vec-perm-indices.h"
58 : : #include "tree-eh.h"
59 : : #include "case-cfn-macros.h"
60 : : #include "langhooks.h"
61 : : #include "opts.h"
62 : :
63 : : /* Loop Vectorization Pass.
64 : :
65 : : This pass tries to vectorize loops.
66 : :
67 : : For example, the vectorizer transforms the following simple loop:
68 : :
69 : : short a[N]; short b[N]; short c[N]; int i;
70 : :
71 : : for (i=0; i<N; i++){
72 : : a[i] = b[i] + c[i];
73 : : }
74 : :
75 : : as if it was manually vectorized by rewriting the source code into:
76 : :
77 : : typedef int __attribute__((mode(V8HI))) v8hi;
78 : : short a[N]; short b[N]; short c[N]; int i;
79 : : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
80 : : v8hi va, vb, vc;
81 : :
82 : : for (i=0; i<N/8; i++){
83 : : vb = pb[i];
84 : : vc = pc[i];
85 : : va = vb + vc;
86 : : pa[i] = va;
87 : : }
88 : :
89 : : The main entry to this pass is vectorize_loops(), in which
90 : : the vectorizer applies a set of analyses on a given set of loops,
91 : : followed by the actual vectorization transformation for the loops that
92 : : had successfully passed the analysis phase.
93 : : Throughout this pass we make a distinction between two types of
94 : : data: scalars (which are represented by SSA_NAMES), and memory references
95 : : ("data-refs"). These two types of data require different handling both
96 : : during analysis and transformation. The types of data-refs that the
97 : : vectorizer currently supports are ARRAY_REFS which base is an array DECL
98 : : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
99 : : accesses are required to have a simple (consecutive) access pattern.
100 : :
101 : : Analysis phase:
102 : : ===============
103 : : The driver for the analysis phase is vect_analyze_loop().
104 : : It applies a set of analyses, some of which rely on the scalar evolution
105 : : analyzer (scev) developed by Sebastian Pop.
106 : :
107 : : During the analysis phase the vectorizer records some information
108 : : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
109 : : loop, as well as general information about the loop as a whole, which is
110 : : recorded in a "loop_vec_info" struct attached to each loop.
111 : :
112 : : Transformation phase:
113 : : =====================
114 : : The loop transformation phase scans all the stmts in the loop, and
115 : : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
116 : : the loop that needs to be vectorized. It inserts the vector code sequence
117 : : just before the scalar stmt S, and records a pointer to the vector code
118 : : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
119 : : attached to S). This pointer will be used for the vectorization of following
120 : : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
121 : : otherwise, we rely on dead code elimination for removing it.
122 : :
123 : : For example, say stmt S1 was vectorized into stmt VS1:
124 : :
125 : : VS1: vb = px[i];
126 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
127 : : S2: a = b;
128 : :
129 : : To vectorize stmt S2, the vectorizer first finds the stmt that defines
130 : : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
131 : : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
132 : : resulting sequence would be:
133 : :
134 : : VS1: vb = px[i];
135 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
136 : : VS2: va = vb;
137 : : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 : :
139 : : Operands that are not SSA_NAMEs, are data-refs that appear in
140 : : load/store operations (like 'x[i]' in S1), and are handled differently.
141 : :
142 : : Target modeling:
143 : : =================
144 : : Currently the only target specific information that is used is the
145 : : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
146 : : Targets that can support different sizes of vectors, for now will need
147 : : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
148 : : flexibility will be added in the future.
149 : :
150 : : Since we only vectorize operations which vector form can be
151 : : expressed using existing tree codes, to verify that an operation is
152 : : supported, the vectorizer checks the relevant optab at the relevant
153 : : machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
154 : : the value found is CODE_FOR_nothing, then there's no target support, and
155 : : we can't vectorize the stmt.
156 : :
157 : : For additional information on this project see:
158 : : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
159 : : */
160 : :
161 : : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
162 : : unsigned *);
163 : : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
164 : : gphi **);
165 : :
166 : :
167 : : /* Function vect_is_simple_iv_evolution.
168 : :
169 : : FORNOW: A simple evolution of an induction variables in the loop is
170 : : considered a polynomial evolution. */
171 : :
172 : : static bool
173 : 667810 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
174 : : stmt_vec_info stmt_info)
175 : : {
176 : 667810 : tree init_expr;
177 : 667810 : tree step_expr;
178 : 667810 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
179 : 667810 : basic_block bb;
180 : :
181 : : /* When there is no evolution in this loop, the evolution function
182 : : is not "simple". */
183 : 667810 : if (evolution_part == NULL_TREE)
184 : : return false;
185 : :
186 : : /* When the evolution is a polynomial of degree >= 2
187 : : the evolution function is not "simple". */
188 : 709950 : if (tree_is_chrec (evolution_part))
189 : : return false;
190 : :
191 : 617719 : step_expr = evolution_part;
192 : 617719 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
193 : :
194 : 617719 : if (dump_enabled_p ())
195 : 36615 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
196 : : step_expr, init_expr);
197 : :
198 : 617719 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
199 : 617719 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;
200 : :
201 : 617719 : if (TREE_CODE (step_expr) != INTEGER_CST
202 : 49660 : && (TREE_CODE (step_expr) != SSA_NAME
203 : 41929 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
204 : 41767 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
205 : 7544 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
206 : 111 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
207 : 111 : || !flag_associative_math)))
208 : 659916 : && (TREE_CODE (step_expr) != REAL_CST
209 : 407 : || !flag_associative_math))
210 : : {
211 : 42140 : if (dump_enabled_p ())
212 : 2726 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
213 : : "step unknown.\n");
214 : 42140 : return false;
215 : : }
216 : :
217 : : return true;
218 : : }
219 : :
220 : : /* Function vect_is_nonlinear_iv_evolution
221 : :
222 : : Only support nonlinear induction for integer type
223 : : 1. neg
224 : : 2. mul by constant
225 : : 3. lshift/rshift by constant.
226 : :
227 : : For neg induction, return a fake step as integer -1. */
228 : : static bool
229 : 89890 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
230 : : gphi* loop_phi_node)
231 : : {
232 : 89890 : tree init_expr, ev_expr, result, op1, op2;
233 : 89890 : gimple* def;
234 : :
235 : 89890 : if (gimple_phi_num_args (loop_phi_node) != 2)
236 : : return false;
237 : :
238 : 89890 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
239 : 89890 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
240 : :
241 : : /* Support nonlinear induction only for integer type. */
242 : 89890 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
243 : : return false;
244 : :
245 : 67068 : result = PHI_RESULT (loop_phi_node);
246 : :
247 : 67068 : if (TREE_CODE (ev_expr) != SSA_NAME
248 : 64934 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
249 : 67068 : || !is_gimple_assign (def))
250 : : return false;
251 : :
252 : 59867 : enum tree_code t_code = gimple_assign_rhs_code (def);
253 : 59867 : tree step;
254 : 59867 : switch (t_code)
255 : : {
256 : 1790 : case NEGATE_EXPR:
257 : 1790 : if (gimple_assign_rhs1 (def) != result)
258 : : return false;
259 : 1790 : step = build_int_cst (TREE_TYPE (init_expr), -1);
260 : 1790 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
261 : 1790 : break;
262 : :
263 : 9721 : case RSHIFT_EXPR:
264 : 9721 : case LSHIFT_EXPR:
265 : 9721 : case MULT_EXPR:
266 : 9721 : op1 = gimple_assign_rhs1 (def);
267 : 9721 : op2 = gimple_assign_rhs2 (def);
268 : 9721 : if (TREE_CODE (op2) != INTEGER_CST
269 : 6179 : || op1 != result)
270 : : return false;
271 : 6048 : step = op2;
272 : 6048 : if (t_code == LSHIFT_EXPR)
273 : 193 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
274 : 5855 : else if (t_code == RSHIFT_EXPR)
275 : 5235 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
276 : : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
277 : : else
278 : 620 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
279 : : break;
280 : :
281 : : default:
282 : : return false;
283 : : }
284 : :
285 : 7838 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
286 : 7838 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
287 : :
288 : 7838 : return true;
289 : : }
290 : :
291 : : /* Returns true if Phi is a first-order recurrence. A first-order
292 : : recurrence is a non-reduction recurrence relation in which the value of
293 : : the recurrence in the current loop iteration equals a value defined in
294 : : the previous iteration. */
295 : :
296 : : static bool
297 : 21021 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
298 : : gphi *phi)
299 : : {
300 : : /* A nested cycle isn't vectorizable as first order recurrence. */
301 : 21021 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
302 : : return false;
303 : :
304 : : /* Ensure the loop latch definition is from within the loop. */
305 : 20855 : edge latch = loop_latch_edge (loop);
306 : 20855 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
307 : 20855 : if (TREE_CODE (ldef) != SSA_NAME
308 : 18438 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
309 : 18410 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
310 : 38039 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
311 : 3984 : return false;
312 : :
313 : 16871 : tree def = gimple_phi_result (phi);
314 : :
315 : : /* Ensure every use_stmt of the phi node is dominated by the latch
316 : : definition. */
317 : 16871 : imm_use_iterator imm_iter;
318 : 16871 : use_operand_p use_p;
319 : 35970 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
320 : 18607 : if (!is_gimple_debug (USE_STMT (use_p))
321 : 36212 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
322 : 10641 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
323 : : USE_STMT (use_p))))
324 : 16379 : return false;
325 : :
326 : : /* First-order recurrence autovectorization needs shuffle vector. */
327 : 492 : tree scalar_type = TREE_TYPE (def);
328 : 492 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
329 : 492 : if (!vectype)
330 : : return false;
331 : :
332 : : return true;
333 : : }
334 : :
335 : : /* Function vect_analyze_scalar_cycles_1.
336 : :
337 : : Examine the cross iteration def-use cycles of scalar variables
338 : : in LOOP. LOOP_VINFO represents the loop that is now being
339 : : considered for vectorization (can be LOOP, or an outer-loop
340 : : enclosing LOOP). SLP indicates there will be some subsequent
341 : : slp analyses or not. */
342 : :
343 : : static void
344 : 326230 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
345 : : {
346 : 326230 : basic_block bb = loop->header;
347 : 326230 : auto_vec<stmt_vec_info, 64> worklist;
348 : 326230 : gphi_iterator gsi;
349 : :
350 : 326230 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
351 : :
352 : : /* First - identify all inductions. Reduction detection assumes that all the
353 : : inductions have been identified, therefore, this order must not be
354 : : changed. */
355 : 1171537 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
356 : : {
357 : 845307 : gphi *phi = gsi.phi ();
358 : 845307 : tree access_fn = NULL;
359 : 845307 : tree def = PHI_RESULT (phi);
360 : 845307 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
361 : :
362 : : /* Skip virtual phi's. The data dependences that are associated with
363 : : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
364 : 1690614 : if (virtual_operand_p (def))
365 : 261896 : continue;
366 : :
367 : : /* Skip already analyzed inner loop PHIs of double reductions. */
368 : 668703 : if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
369 : 893 : continue;
370 : :
371 : 667810 : if (dump_enabled_p ())
372 : 38499 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
373 : : (gimple *) phi);
374 : :
375 : 667810 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
376 : :
377 : : /* Analyze the evolution function. */
378 : 667810 : access_fn = analyze_scalar_evolution (loop, def);
379 : 667810 : if (dump_enabled_p ())
380 : 38499 : dump_printf_loc (MSG_NOTE, vect_location,
381 : : "Access function of PHI: %T\n", access_fn);
382 : 667810 : if (access_fn)
383 : 667810 : STRIP_NOPS (access_fn);
384 : :
385 : 752209 : if ((!access_fn
386 : 667810 : || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
387 : 575579 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
388 : 10413 : && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
389 : : != INTEGER_CST)))
390 : : /* Only handle nonlinear iv for same loop. */
391 : 760047 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
392 : 89890 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
393 : : {
394 : 84399 : worklist.safe_push (stmt_vinfo);
395 : 84399 : continue;
396 : : }
397 : :
398 : 583411 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
399 : : != NULL_TREE);
400 : 583411 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
401 : :
402 : 583411 : if (dump_enabled_p ())
403 : 33981 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
404 : 583411 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
405 : :
406 : : /* Mark if we have a non-linear IV. */
407 : 583411 : LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
408 : 583411 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
409 : : }
410 : :
411 : :
412 : : /* Second - identify all reductions and nested cycles. */
413 : 410629 : while (worklist.length () > 0)
414 : : {
415 : 84399 : stmt_vec_info stmt_vinfo = worklist.pop ();
416 : 84399 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
417 : 84399 : tree def = PHI_RESULT (phi);
418 : :
419 : 84399 : if (dump_enabled_p ())
420 : 4518 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
421 : : (gimple *) phi);
422 : :
423 : 168798 : gcc_assert (!virtual_operand_p (def)
424 : : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
425 : :
426 : 84399 : gphi *double_reduc;
427 : 84399 : stmt_vec_info reduc_stmt_info
428 : 84399 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
429 : 84399 : if (reduc_stmt_info && double_reduc)
430 : : {
431 : 982 : stmt_vec_info inner_phi_info
432 : 982 : = loop_vinfo->lookup_stmt (double_reduc);
433 : : /* ??? Pass down flag we're the inner loop of a double reduc. */
434 : 982 : stmt_vec_info inner_reduc_info
435 : 982 : = vect_is_simple_reduction (loop_vinfo, inner_phi_info, NULL);
436 : 982 : if (inner_reduc_info)
437 : : {
438 : 893 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
439 : 893 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
440 : 893 : STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
441 : 893 : STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
442 : 893 : if (dump_enabled_p ())
443 : 120 : dump_printf_loc (MSG_NOTE, vect_location,
444 : : "Detected double reduction.\n");
445 : :
446 : 893 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
447 : 893 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
448 : 893 : STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
449 : : /* Make it accessible for SLP vectorization. */
450 : 893 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
451 : : }
452 : 89 : else if (dump_enabled_p ())
453 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
454 : : "Unknown def-use cycle pattern.\n");
455 : : }
456 : 83417 : else if (reduc_stmt_info)
457 : : {
458 : 62396 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
459 : : {
460 : 2181 : if (dump_enabled_p ())
461 : 357 : dump_printf_loc (MSG_NOTE, vect_location,
462 : : "Detected vectorizable nested cycle.\n");
463 : :
464 : 2181 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
465 : : }
466 : : else
467 : : {
468 : 60215 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
469 : 60215 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
470 : 60215 : if (dump_enabled_p ())
471 : 3587 : dump_printf_loc (MSG_NOTE, vect_location,
472 : : "Detected reduction.\n");
473 : :
474 : 60215 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
475 : 60215 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
476 : 60215 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
477 : : }
478 : : }
479 : 21021 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
480 : 486 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
481 : : else
482 : 20535 : if (dump_enabled_p ())
483 : 370 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484 : : "Unknown def-use cycle pattern.\n");
485 : : }
486 : 326230 : }
487 : :
488 : :
489 : : /* Function vect_analyze_scalar_cycles.
490 : :
491 : : Examine the cross iteration def-use cycles of scalar variables, by
492 : : analyzing the loop-header PHIs of scalar variables. Classify each
493 : : cycle as one of the following: invariant, induction, reduction, unknown.
494 : : We do that for the loop represented by LOOP_VINFO, and also to its
495 : : inner-loop, if exists.
496 : : Examples for scalar cycles:
497 : :
498 : : Example1: reduction:
499 : :
500 : : loop1:
501 : : for (i=0; i<N; i++)
502 : : sum += a[i];
503 : :
504 : : Example2: induction:
505 : :
506 : : loop2:
507 : : for (i=0; i<N; i++)
508 : : a[i] = i; */
509 : :
510 : : static void
511 : 320942 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
512 : : {
513 : 320942 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
514 : :
515 : 320942 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
516 : :
517 : : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
518 : : Reductions in such inner-loop therefore have different properties than
519 : : the reductions in the nest that gets vectorized:
520 : : 1. When vectorized, they are executed in the same order as in the original
521 : : scalar loop, so we can't change the order of computation when
522 : : vectorizing them.
523 : : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
524 : : current checks are too strict. */
525 : :
526 : 320942 : if (loop->inner)
527 : 5288 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
528 : 320942 : }
529 : :
530 : : /* Function vect_get_loop_niters.
531 : :
532 : : Determine how many iterations the loop is executed and place it
533 : : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
534 : : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
535 : : niter information holds in ASSUMPTIONS.
536 : :
537 : : Return the loop exit conditions. */
538 : :
539 : :
540 : : static vec<gcond *>
541 : 266163 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
542 : : tree *number_of_iterations, tree *number_of_iterationsm1)
543 : : {
544 : 266163 : auto_vec<edge> exits = get_loop_exit_edges (loop);
545 : 266163 : vec<gcond *> conds;
546 : 532326 : conds.create (exits.length ());
547 : 266163 : class tree_niter_desc niter_desc;
548 : 266163 : tree niter_assumptions, niter, may_be_zero;
549 : :
550 : 266163 : *assumptions = boolean_true_node;
551 : 266163 : *number_of_iterationsm1 = chrec_dont_know;
552 : 266163 : *number_of_iterations = chrec_dont_know;
553 : :
554 : 266163 : DUMP_VECT_SCOPE ("get_loop_niters");
555 : :
556 : 266163 : if (exits.is_empty ())
557 : 0 : return conds;
558 : :
559 : 266163 : if (dump_enabled_p ())
560 : 13959 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
561 : : exits.length ());
562 : :
563 : : edge exit;
564 : : unsigned int i;
565 : 640934 : FOR_EACH_VEC_ELT (exits, i, exit)
566 : : {
567 : 374771 : gcond *cond = get_loop_exit_condition (exit);
568 : 374771 : if (cond)
569 : 365092 : conds.safe_push (cond);
570 : :
571 : 374771 : if (dump_enabled_p ())
572 : 15018 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
573 : :
574 : 374771 : if (exit != main_exit)
575 : 149706 : continue;
576 : :
577 : 266163 : may_be_zero = NULL_TREE;
578 : 266163 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
579 : 266163 : || chrec_contains_undetermined (niter_desc.niter))
580 : 41098 : continue;
581 : :
582 : 225065 : niter_assumptions = niter_desc.assumptions;
583 : 225065 : may_be_zero = niter_desc.may_be_zero;
584 : 225065 : niter = niter_desc.niter;
585 : :
586 : 225065 : if (may_be_zero && integer_zerop (may_be_zero))
587 : : may_be_zero = NULL_TREE;
588 : :
589 : 9665 : if (may_be_zero)
590 : : {
591 : 9665 : if (COMPARISON_CLASS_P (may_be_zero))
592 : : {
593 : : /* Try to combine may_be_zero with assumptions, this can simplify
594 : : computation of niter expression. */
595 : 9665 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
596 : 1083 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
597 : : niter_assumptions,
598 : : fold_build1 (TRUTH_NOT_EXPR,
599 : : boolean_type_node,
600 : : may_be_zero));
601 : : else
602 : 8582 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
603 : : build_int_cst (TREE_TYPE (niter), 0),
604 : : rewrite_to_non_trapping_overflow (niter));
605 : :
606 : 225065 : may_be_zero = NULL_TREE;
607 : : }
608 : 0 : else if (integer_nonzerop (may_be_zero))
609 : : {
610 : 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
611 : 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
612 : 0 : continue;
613 : : }
614 : : else
615 : 0 : continue;
616 : : }
617 : :
618 : : /* Loop assumptions are based off the normal exit. */
619 : 225065 : *assumptions = niter_assumptions;
620 : 225065 : *number_of_iterationsm1 = niter;
621 : :
622 : : /* We want the number of loop header executions which is the number
623 : : of latch executions plus one.
624 : : ??? For UINT_MAX latch executions this number overflows to zero
625 : : for loops like do { n++; } while (n != 0); */
626 : 225065 : if (niter && !chrec_contains_undetermined (niter))
627 : : {
628 : 225065 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
629 : : unshare_expr (niter),
630 : : build_int_cst (TREE_TYPE (niter), 1));
631 : 225065 : if (TREE_CODE (niter) == INTEGER_CST
632 : 121160 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
633 : : {
634 : : /* If we manage to fold niter + 1 into INTEGER_CST even when
635 : : niter is some complex expression, ensure back
636 : : *number_of_iterationsm1 is an INTEGER_CST as well. See
637 : : PR113210. */
638 : 0 : *number_of_iterationsm1
639 : 0 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
640 : : build_minus_one_cst (TREE_TYPE (niter)));
641 : : }
642 : : }
643 : 225065 : *number_of_iterations = niter;
644 : : }
645 : :
646 : 266163 : if (dump_enabled_p ())
647 : 13959 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
648 : :
649 : 266163 : return conds;
650 : 266163 : }
651 : :
652 : : /* Determine the main loop exit for the vectorizer. */
653 : :
654 : : edge
655 : 502033 : vec_init_loop_exit_info (class loop *loop)
656 : : {
657 : : /* Before we begin we must first determine which exit is the main one and
658 : : which are auxilary exits. */
659 : 502033 : auto_vec<edge> exits = get_loop_exit_edges (loop);
660 : 502033 : if (exits.length () == 1)
661 : 319402 : return exits[0];
662 : :
663 : : /* If we have multiple exits we only support counting IV at the moment.
664 : : Analyze all exits and return the last one we can analyze. */
665 : 182631 : class tree_niter_desc niter_desc;
666 : 182631 : edge candidate = NULL;
667 : 1184527 : for (edge exit : exits)
668 : : {
669 : 646498 : if (!get_loop_exit_condition (exit))
670 : 147523 : continue;
671 : :
672 : 498975 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
673 : 498975 : && !chrec_contains_undetermined (niter_desc.niter))
674 : : {
675 : 138197 : tree may_be_zero = niter_desc.may_be_zero;
676 : 138197 : if ((integer_zerop (may_be_zero)
677 : : /* As we are handling may_be_zero that's not false by
678 : : rewriting niter to may_be_zero ? 0 : niter we require
679 : : an empty latch. */
680 : 657654 : || (single_pred_p (loop->latch)
681 : 10827 : && exit->src == single_pred (loop->latch)
682 : 2704 : && (integer_nonzerop (may_be_zero)
683 : 2704 : || COMPARISON_CLASS_P (may_be_zero))))
684 : 140901 : && (!candidate
685 : 7078 : || dominated_by_p (CDI_DOMINATORS, exit->src,
686 : 7078 : candidate->src)))
687 : : candidate = exit;
688 : : }
689 : : }
690 : :
691 : 182631 : return candidate;
692 : 182631 : }
693 : :
694 : : /* Function bb_in_loop_p
695 : :
696 : : Used as predicate for dfs order traversal of the loop bbs. */
697 : :
698 : : static bool
699 : 1315918 : bb_in_loop_p (const_basic_block bb, const void *data)
700 : : {
701 : 1315918 : const class loop *const loop = (const class loop *)data;
702 : 1315918 : if (flow_bb_inside_loop_p (loop, bb))
703 : : return true;
704 : : return false;
705 : : }
706 : :
707 : :
708 : : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
709 : : stmt_vec_info structs for all the stmts in LOOP_IN. */
710 : :
711 : 416289 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
712 : : : vec_info (vec_info::loop, shared),
713 : 416289 : loop (loop_in),
714 : 416289 : num_itersm1 (NULL_TREE),
715 : 416289 : num_iters (NULL_TREE),
716 : 416289 : num_iters_unchanged (NULL_TREE),
717 : 416289 : num_iters_assumptions (NULL_TREE),
718 : 416289 : vector_costs (nullptr),
719 : 416289 : scalar_costs (nullptr),
720 : 416289 : th (0),
721 : 416289 : versioning_threshold (0),
722 : 416289 : vectorization_factor (0),
723 : 416289 : main_loop_edge (nullptr),
724 : 416289 : skip_main_loop_edge (nullptr),
725 : 416289 : skip_this_loop_edge (nullptr),
726 : 416289 : reusable_accumulators (),
727 : 416289 : suggested_unroll_factor (1),
728 : 416289 : max_vectorization_factor (0),
729 : 416289 : mask_skip_niters (NULL_TREE),
730 : 416289 : mask_skip_niters_pfa_offset (NULL_TREE),
731 : 416289 : rgroup_compare_type (NULL_TREE),
732 : 416289 : simd_if_cond (NULL_TREE),
733 : 416289 : partial_vector_style (vect_partial_vectors_none),
734 : 416289 : unaligned_dr (NULL),
735 : 416289 : peeling_for_alignment (0),
736 : 416289 : ptr_mask (0),
737 : 416289 : max_spec_read_amount (0),
738 : 416289 : nonlinear_iv (false),
739 : 416289 : ivexpr_map (NULL),
740 : 416289 : scan_map (NULL),
741 : 416289 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
742 : 416289 : vectorizable (false),
743 : 416289 : can_use_partial_vectors_p (true),
744 : 416289 : must_use_partial_vectors_p (false),
745 : 416289 : using_partial_vectors_p (false),
746 : 416289 : using_decrementing_iv_p (false),
747 : 416289 : using_select_vl_p (false),
748 : 416289 : allow_mutual_alignment (false),
749 : 416289 : partial_load_store_bias (0),
750 : 416289 : peeling_for_gaps (false),
751 : 416289 : peeling_for_niter (false),
752 : 416289 : early_breaks (false),
753 : 416289 : user_unroll (false),
754 : 416289 : no_data_dependencies (false),
755 : 416289 : has_mask_store (false),
756 : 416289 : scalar_loop_scaling (profile_probability::uninitialized ()),
757 : 416289 : scalar_loop (NULL),
758 : 416289 : main_loop_info (NULL),
759 : 416289 : orig_loop_info (NULL),
760 : 416289 : epilogue_vinfo (NULL),
761 : 416289 : drs_advanced_by (NULL_TREE),
762 : 416289 : vec_loop_iv_exit (NULL),
763 : 416289 : vec_epilogue_loop_iv_exit (NULL),
764 : 416289 : scalar_loop_iv_exit (NULL)
765 : : {
766 : : /* CHECKME: We want to visit all BBs before their successors (except for
767 : : latch blocks, for which this assertion wouldn't hold). In the simple
768 : : case of the loop forms we allow, a dfs order of the BBs would the same
769 : : as reversed postorder traversal, so we are safe. */
770 : :
771 : 416289 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
772 : 832578 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
773 : 416289 : loop->num_nodes, loop);
774 : 416289 : gcc_assert (nbbs == loop->num_nodes);
775 : :
776 : 1503253 : for (unsigned int i = 0; i < nbbs; i++)
777 : : {
778 : 1086964 : basic_block bb = bbs[i];
779 : 1086964 : gimple_stmt_iterator si;
780 : :
781 : 2208761 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
782 : : {
783 : 1121797 : gimple *phi = gsi_stmt (si);
784 : 1121797 : gimple_set_uid (phi, 0);
785 : 1121797 : add_stmt (phi);
786 : : }
787 : :
788 : 9275875 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
789 : : {
790 : 7101947 : gimple *stmt = gsi_stmt (si);
791 : 7101947 : gimple_set_uid (stmt, 0);
792 : 7101947 : if (is_gimple_debug (stmt))
793 : 2662549 : continue;
794 : 4439398 : add_stmt (stmt);
795 : : /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
796 : : third argument is the #pragma omp simd if (x) condition, when 0,
797 : : loop shouldn't be vectorized, when non-zero constant, it should
798 : : be vectorized normally, otherwise versioned with vectorized loop
799 : : done if the condition is non-zero at runtime. */
800 : 4439398 : if (loop_in->simduid
801 : 43390 : && is_gimple_call (stmt)
802 : 4268 : && gimple_call_internal_p (stmt)
803 : 4141 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
804 : 4137 : && gimple_call_num_args (stmt) >= 3
805 : 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
806 : 4439501 : && (loop_in->simduid
807 : 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
808 : : {
809 : 103 : tree arg = gimple_call_arg (stmt, 2);
810 : 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
811 : 103 : simd_if_cond = arg;
812 : : else
813 : 0 : gcc_assert (integer_nonzerop (arg));
814 : : }
815 : : }
816 : : }
817 : 416289 : }
818 : :
819 : : /* Free all levels of rgroup CONTROLS. */
820 : :
821 : : void
822 : 1078302 : release_vec_loop_controls (vec<rgroup_controls> *controls)
823 : : {
824 : 1078302 : rgroup_controls *rgc;
825 : 1078302 : unsigned int i;
826 : 1094755 : FOR_EACH_VEC_ELT (*controls, i, rgc)
827 : 16453 : rgc->controls.release ();
828 : 1078302 : controls->release ();
829 : 1078302 : }
830 : :
831 : : /* Free all memory used by the _loop_vec_info, as well as all the
832 : : stmt_vec_info structs of all the stmts in the loop. */
833 : :
834 : 416289 : _loop_vec_info::~_loop_vec_info ()
835 : : {
836 : 416289 : free (bbs);
837 : :
838 : 416289 : release_vec_loop_controls (&masks.rgc_vec);
839 : 416289 : release_vec_loop_controls (&lens);
840 : 420087 : delete ivexpr_map;
841 : 416611 : delete scan_map;
842 : 416289 : delete scalar_costs;
843 : 416289 : delete vector_costs;
844 : 559398 : for (auto reduc_info : reduc_infos)
845 : 139322 : delete reduc_info;
846 : :
847 : : /* When we release an epiloge vinfo that we do not intend to use
848 : : avoid clearing AUX of the main loop which should continue to
849 : : point to the main loop vinfo since otherwise we'll leak that. */
850 : 416289 : if (loop->aux == this)
851 : 60627 : loop->aux = NULL;
852 : 832578 : }
853 : :
854 : : /* Return an invariant or register for EXPR and emit necessary
855 : : computations in the LOOP_VINFO loop preheader. */
856 : :
857 : : tree
858 : 19410 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
859 : : {
860 : 19410 : if (is_gimple_reg (expr)
861 : 19410 : || is_gimple_min_invariant (expr))
862 : 6428 : return expr;
863 : :
864 : 12982 : if (! loop_vinfo->ivexpr_map)
865 : 3798 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
866 : 12982 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
867 : 12982 : if (! cached)
868 : : {
869 : 8474 : gimple_seq stmts = NULL;
870 : 8474 : cached = force_gimple_operand (unshare_expr (expr),
871 : : &stmts, true, NULL_TREE);
872 : 8474 : if (stmts)
873 : : {
874 : 8334 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
875 : 8334 : gsi_insert_seq_on_edge_immediate (e, stmts);
876 : : }
877 : : }
878 : 12982 : return cached;
879 : : }
880 : :
881 : : /* Return true if we can use CMP_TYPE as the comparison type to produce
882 : : all masks required to mask LOOP_VINFO. */
883 : :
884 : : static bool
885 : 74007 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
886 : : {
887 : 74007 : rgroup_controls *rgm;
888 : 74007 : unsigned int i;
889 : 86530 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
890 : 86530 : if (rgm->type != NULL_TREE
891 : 86530 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
892 : : cmp_type, rgm->type,
893 : : OPTIMIZE_FOR_SPEED))
894 : : return false;
895 : : return true;
896 : : }
897 : :
898 : : /* Calculate the maximum number of scalars per iteration for every
899 : : rgroup in LOOP_VINFO. */
900 : :
901 : : static unsigned int
902 : 15654 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
903 : : {
904 : 15654 : unsigned int res = 1;
905 : 15654 : unsigned int i;
906 : 15654 : rgroup_controls *rgm;
907 : 38764 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
908 : 23110 : res = MAX (res, rgm->max_nscalars_per_iter);
909 : 15654 : return res;
910 : : }
911 : :
912 : : /* Calculate the minimum precision necessary to represent:
913 : :
914 : : MAX_NITERS * FACTOR
915 : :
916 : : as an unsigned integer, where MAX_NITERS is the maximum number of
917 : : loop header iterations for the original scalar form of LOOP_VINFO. */
918 : :
919 : : static unsigned
920 : 15654 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
921 : : {
922 : 15654 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
923 : :
924 : : /* Get the maximum number of iterations that is representable
925 : : in the counter type. */
926 : 15654 : tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
927 : 15654 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
928 : :
929 : : /* Get a more refined estimate for the number of iterations. */
930 : 15654 : widest_int max_back_edges;
931 : 15654 : if (max_loop_iterations (loop, &max_back_edges))
932 : 15654 : max_ni = wi::smin (max_ni, max_back_edges + 1);
933 : :
934 : : /* Work out how many bits we need to represent the limit. */
935 : 15654 : return wi::min_precision (max_ni * factor, UNSIGNED);
936 : 15654 : }
937 : :
938 : : /* True if the loop needs peeling or partial vectors when vectorized. */
939 : :
940 : : static bool
941 : 113391 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
942 : : {
943 : 113391 : unsigned HOST_WIDE_INT const_vf;
944 : 113391 : HOST_WIDE_INT max_niter
945 : 113391 : = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
946 : :
947 : 113391 : unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
948 : 113391 : if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
949 : 12452 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
950 : : (loop_vinfo));
951 : :
952 : 12452 : loop_vec_info main_loop_vinfo
953 : 113391 : = (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
954 : 113391 : ? LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) : loop_vinfo);
955 : 113391 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
956 : 53701 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo) >= 0)
957 : : {
958 : : /* Work out the (constant) number of iterations that need to be
959 : : peeled for reasons other than niters. */
960 : 53658 : unsigned int peel_niter
961 : : = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
962 : 53658 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
963 : 367 : peel_niter += 1;
964 : 112403 : if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
965 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
966 : : return true;
967 : : }
968 : 59733 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo)
969 : : /* ??? When peeling for gaps but not alignment, we could
970 : : try to check whether the (variable) niters is known to be
971 : : VF * N + 1. That's something of a niche case though. */
972 : 59458 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
973 : 58586 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
974 : 118319 : || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
975 : 117172 : < (unsigned) exact_log2 (const_vf))
976 : : /* In case of versioning, check if the maximum number of
977 : : iterations is greater than th. If they are identical,
978 : : the epilogue is unnecessary. */
979 : 57624 : && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
980 : 4279 : || ((unsigned HOST_WIDE_INT) max_niter
981 : : /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
982 : : but that's only computed later based on our result.
983 : : The following is the most conservative approximation. */
984 : 4279 : > (std::max ((unsigned HOST_WIDE_INT) th,
985 : 4279 : const_vf) / const_vf) * const_vf))))
986 : 58745 : return true;
987 : :
988 : : return false;
989 : : }
990 : :
991 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
992 : : whether we can actually generate the masks required. Return true if so,
993 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
994 : :
995 : : static bool
996 : 15654 : vect_verify_full_masking (loop_vec_info loop_vinfo)
997 : : {
998 : 15654 : unsigned int min_ni_width;
999 : :
1000 : : /* Use a normal loop if there are no statements that need masking.
1001 : : This only happens in rare degenerate cases: it means that the loop
1002 : : has no loads, no stores, and no live-out values. */
1003 : 15654 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1004 : : return false;
1005 : :
1006 : : /* Produce the rgroup controls. */
1007 : 61618 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1008 : : {
1009 : 22982 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1010 : 22982 : tree vectype = mask.first;
1011 : 22982 : unsigned nvectors = mask.second;
1012 : :
1013 : 30310 : if (masks->rgc_vec.length () < nvectors)
1014 : 17361 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1015 : 22982 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1016 : : /* The number of scalars per iteration and the number of vectors are
1017 : : both compile-time constants. */
1018 : 22982 : unsigned int nscalars_per_iter
1019 : 22982 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1020 : 22982 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1021 : :
1022 : 22982 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1023 : : {
1024 : 18878 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1025 : 18878 : rgm->type = truth_type_for (vectype);
1026 : 18878 : rgm->factor = 1;
1027 : : }
1028 : : }
1029 : :
1030 : 15654 : unsigned int max_nscalars_per_iter
1031 : 15654 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1032 : :
1033 : : /* Work out how many bits we need to represent the limit. */
1034 : 15654 : min_ni_width
1035 : 15654 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1036 : :
1037 : : /* Find a scalar mode for which WHILE_ULT is supported. */
1038 : 15654 : opt_scalar_int_mode cmp_mode_iter;
1039 : 15654 : tree cmp_type = NULL_TREE;
1040 : 15654 : tree iv_type = NULL_TREE;
1041 : 15654 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1042 : 15654 : unsigned int iv_precision = UINT_MAX;
1043 : :
1044 : 15654 : if (iv_limit != -1)
1045 : 15654 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1046 : : UNSIGNED);
1047 : :
1048 : 125232 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1049 : : {
1050 : 109578 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1051 : 109578 : if (cmp_bits >= min_ni_width
1052 : 109578 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1053 : : {
1054 : 74007 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1055 : 74007 : if (this_type
1056 : 74007 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1057 : : {
1058 : : /* Although we could stop as soon as we find a valid mode,
1059 : : there are at least two reasons why that's not always the
1060 : : best choice:
1061 : :
1062 : : - An IV that's Pmode or wider is more likely to be reusable
1063 : : in address calculations than an IV that's narrower than
1064 : : Pmode.
1065 : :
1066 : : - Doing the comparison in IV_PRECISION or wider allows
1067 : : a natural 0-based IV, whereas using a narrower comparison
1068 : : type requires mitigations against wrap-around.
1069 : :
1070 : : Conversely, if the IV limit is variable, doing the comparison
1071 : : in a wider type than the original type can introduce
1072 : : unnecessary extensions, so picking the widest valid mode
1073 : : is not always a good choice either.
1074 : :
1075 : : Here we prefer the first IV type that's Pmode or wider,
1076 : : and the first comparison type that's IV_PRECISION or wider.
1077 : : (The comparison type must be no wider than the IV type,
1078 : : to avoid extensions in the vector loop.)
1079 : :
1080 : : ??? We might want to try continuing beyond Pmode for ILP32
1081 : : targets if CMP_BITS < IV_PRECISION. */
1082 : 0 : iv_type = this_type;
1083 : 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1084 : : cmp_type = this_type;
1085 : 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1086 : : break;
1087 : : }
1088 : : }
1089 : : }
1090 : :
1091 : 15654 : if (!cmp_type)
1092 : : {
1093 : 15654 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1094 : 15654 : return false;
1095 : : }
1096 : :
1097 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1098 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1099 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1100 : 0 : return true;
1101 : 15654 : }
1102 : :
1103 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1104 : : whether we can actually generate AVX512 style masks. Return true if so,
1105 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1106 : :
1107 : : static bool
1108 : 15654 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1109 : : {
1110 : : /* Produce differently organized rgc_vec and differently check
1111 : : we can produce masks. */
1112 : :
1113 : : /* Use a normal loop if there are no statements that need masking.
1114 : : This only happens in rare degenerate cases: it means that the loop
1115 : : has no loads, no stores, and no live-out values. */
1116 : 15654 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1117 : : return false;
1118 : :
1119 : : /* For the decrementing IV we need to represent all values in
1120 : : [0, niter + niter_skip] where niter_skip is the elements we
1121 : : skip in the first iteration for prologue peeling. */
1122 : 15654 : tree iv_type = NULL_TREE;
1123 : 15654 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1124 : 15654 : unsigned int iv_precision = UINT_MAX;
1125 : 15654 : if (iv_limit != -1)
1126 : 15654 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1127 : :
1128 : : /* First compute the type for the IV we use to track the remaining
1129 : : scalar iterations. */
1130 : 15654 : opt_scalar_int_mode cmp_mode_iter;
1131 : 19901 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1132 : : {
1133 : 19901 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1134 : 19901 : if (cmp_bits >= iv_precision
1135 : 19901 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1136 : : {
1137 : 15654 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1138 : 15654 : if (iv_type)
1139 : : break;
1140 : : }
1141 : : }
1142 : 15654 : if (!iv_type)
1143 : : return false;
1144 : :
1145 : : /* Produce the rgroup controls. */
1146 : 61618 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1147 : : {
1148 : 22982 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1149 : 22982 : tree vectype = mask.first;
1150 : 22982 : unsigned nvectors = mask.second;
1151 : :
1152 : : /* The number of scalars per iteration and the number of vectors are
1153 : : both compile-time constants. */
1154 : 22982 : unsigned int nscalars_per_iter
1155 : 22982 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1156 : 22982 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1157 : :
1158 : : /* We index the rgroup_controls vector with nscalars_per_iter
1159 : : which we keep constant and instead have a varying nvectors,
1160 : : remembering the vector mask with the fewest nV. */
1161 : 30310 : if (masks->rgc_vec.length () < nscalars_per_iter)
1162 : 15712 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1163 : 22982 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1164 : :
1165 : 22982 : if (!rgm->type || rgm->factor > nvectors)
1166 : : {
1167 : 17226 : rgm->type = truth_type_for (vectype);
1168 : 17226 : rgm->compare_type = NULL_TREE;
1169 : 17226 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1170 : 17226 : rgm->factor = nvectors;
1171 : 17226 : rgm->bias_adjusted_ctrl = NULL_TREE;
1172 : : }
1173 : : }
1174 : :
1175 : : /* There is no fixed compare type we are going to use but we have to
1176 : : be able to get at one for each mask group. */
1177 : 15654 : unsigned int min_ni_width
1178 : 15654 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1179 : :
1180 : 15654 : bool ok = true;
1181 : 60265 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1182 : : {
1183 : 16411 : tree mask_type = rgc.type;
1184 : 16411 : if (!mask_type)
1185 : 681 : continue;
1186 : :
1187 : : /* For now vect_get_loop_mask only supports integer mode masks
1188 : : when we need to split it. */
1189 : 15730 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1190 : 15730 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1191 : : {
1192 : : ok = false;
1193 : : break;
1194 : : }
1195 : :
1196 : : /* If iv_type is usable as compare type use that - we can elide the
1197 : : saturation in that case. */
1198 : 12626 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1199 : : {
1200 : 12626 : tree cmp_vectype
1201 : 12626 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1202 : 12626 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1203 : 4467 : rgc.compare_type = cmp_vectype;
1204 : : }
1205 : 12626 : if (!rgc.compare_type)
1206 : 23851 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1207 : : {
1208 : 23847 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1209 : 23847 : if (cmp_bits >= min_ni_width
1210 : 23847 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1211 : : {
1212 : 23835 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1213 : 23835 : if (!cmp_type)
1214 : 0 : continue;
1215 : :
1216 : : /* Check whether we can produce the mask with cmp_type. */
1217 : 23835 : tree cmp_vectype
1218 : 23835 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1219 : 23835 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1220 : : {
1221 : 8155 : rgc.compare_type = cmp_vectype;
1222 : 8155 : break;
1223 : : }
1224 : : }
1225 : : }
1226 : 12626 : if (!rgc.compare_type)
1227 : : {
1228 : : ok = false;
1229 : : break;
1230 : : }
1231 : : }
1232 : 15654 : if (!ok)
1233 : : {
1234 : 3108 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1235 : 3108 : return false;
1236 : : }
1237 : :
1238 : 12546 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1239 : 12546 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1240 : 12546 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1241 : 12546 : return true;
1242 : 15654 : }
1243 : :
1244 : : /* Check whether we can use vector access with length based on precison
1245 : : comparison. So far, to keep it simple, we only allow the case that the
1246 : : precision of the target supported length is larger than the precision
1247 : : required by loop niters. */
1248 : :
1249 : : static bool
1250 : 6 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1251 : : {
1252 : 6 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1253 : : return false;
1254 : :
1255 : 0 : if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
1256 : : return false;
1257 : :
1258 : 0 : machine_mode len_load_mode, len_store_mode;
1259 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1260 : 0 : .exists (&len_load_mode))
1261 : 0 : return false;
1262 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1263 : 0 : .exists (&len_store_mode))
1264 : 0 : return false;
1265 : :
1266 : 0 : signed char partial_load_bias = internal_len_load_store_bias
1267 : 0 : (IFN_LEN_LOAD, len_load_mode);
1268 : :
1269 : 0 : signed char partial_store_bias = internal_len_load_store_bias
1270 : 0 : (IFN_LEN_STORE, len_store_mode);
1271 : :
1272 : 0 : gcc_assert (partial_load_bias == partial_store_bias);
1273 : :
1274 : 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1275 : : return false;
1276 : :
1277 : : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1278 : : len_loads with a length of zero. In order to avoid that we prohibit
1279 : : more than one loop length here. */
1280 : 0 : if (partial_load_bias == -1
1281 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1282 : : return false;
1283 : :
1284 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1285 : :
1286 : 0 : unsigned int max_nitems_per_iter = 1;
1287 : 0 : unsigned int i;
1288 : 0 : rgroup_controls *rgl;
1289 : : /* Find the maximum number of items per iteration for every rgroup. */
1290 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1291 : : {
1292 : 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1293 : 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1294 : : }
1295 : :
1296 : : /* Work out how many bits we need to represent the length limit. */
1297 : 0 : unsigned int min_ni_prec
1298 : 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1299 : :
1300 : : /* Now use the maximum of below precisions for one suitable IV type:
1301 : : - the IV's natural precision
1302 : : - the precision needed to hold: the maximum number of scalar
1303 : : iterations multiplied by the scale factor (min_ni_prec above)
1304 : : - the Pmode precision
1305 : :
1306 : : If min_ni_prec is less than the precision of the current niters,
1307 : : we perfer to still use the niters type. Prefer to use Pmode and
1308 : : wider IV to avoid narrow conversions. */
1309 : :
1310 : 0 : unsigned int ni_prec
1311 : 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1312 : 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1313 : 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1314 : :
1315 : 0 : tree iv_type = NULL_TREE;
1316 : 0 : opt_scalar_int_mode tmode_iter;
1317 : 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1318 : : {
1319 : 0 : scalar_mode tmode = tmode_iter.require ();
1320 : 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1321 : :
1322 : : /* ??? Do we really want to construct one IV whose precision exceeds
1323 : : BITS_PER_WORD? */
1324 : 0 : if (tbits > BITS_PER_WORD)
1325 : : break;
1326 : :
1327 : : /* Find the first available standard integral type. */
1328 : 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1329 : : {
1330 : 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1331 : 0 : break;
1332 : : }
1333 : : }
1334 : :
1335 : 0 : if (!iv_type)
1336 : : {
1337 : 0 : if (dump_enabled_p ())
1338 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339 : : "can't vectorize with length-based partial vectors"
1340 : : " because there is no suitable iv type.\n");
1341 : 0 : return false;
1342 : : }
1343 : :
1344 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1345 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1346 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1347 : :
1348 : 0 : return true;
1349 : : }
1350 : :
1351 : : /* Calculate the cost of one scalar iteration of the loop. */
1352 : : static void
1353 : 285981 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1354 : : {
1355 : 285981 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1356 : 285981 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1357 : 285981 : int nbbs = loop->num_nodes, factor;
1358 : 285981 : int innerloop_iters, i;
1359 : :
1360 : 285981 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1361 : :
1362 : : /* Gather costs for statements in the scalar loop. */
1363 : :
1364 : : /* FORNOW. */
1365 : 285981 : innerloop_iters = 1;
1366 : 285981 : if (loop->inner)
1367 : 1261 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1368 : :
1369 : 1017541 : for (i = 0; i < nbbs; i++)
1370 : : {
1371 : 731560 : gimple_stmt_iterator si;
1372 : 731560 : basic_block bb = bbs[i];
1373 : :
1374 : 731560 : if (bb->loop_father == loop->inner)
1375 : : factor = innerloop_iters;
1376 : : else
1377 : 729038 : factor = 1;
1378 : :
1379 : 5829270 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1380 : : {
1381 : 4366150 : gimple *stmt = gsi_stmt (si);
1382 : 4366150 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1383 : :
1384 : 4366150 : if (!is_gimple_assign (stmt)
1385 : : && !is_gimple_call (stmt)
1386 : : && !is_a<gcond *> (stmt))
1387 : 1582586 : continue;
1388 : :
1389 : : /* Skip stmts that are not vectorized inside the loop. */
1390 : 2783564 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1391 : 2783564 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1392 : 1183405 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1393 : 65 : || !VECTORIZABLE_CYCLE_DEF
1394 : : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1395 : 1183405 : continue;
1396 : :
1397 : 1600159 : vect_cost_for_stmt kind;
1398 : 1600159 : if (STMT_VINFO_DATA_REF (stmt_info))
1399 : : {
1400 : 677198 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1401 : : kind = scalar_load;
1402 : : else
1403 : 240168 : kind = scalar_store;
1404 : : }
1405 : 922961 : else if (vect_nop_conversion_p (stmt_info))
1406 : 41136 : continue;
1407 : : else
1408 : : kind = scalar_stmt;
1409 : :
1410 : : /* We are using vect_prologue here to avoid scaling twice
1411 : : by the inner loop factor. */
1412 : 1559023 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1413 : : factor, kind, stmt_info, 0, vect_prologue);
1414 : : }
1415 : : }
1416 : :
1417 : : /* Now accumulate cost. */
1418 : 285981 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1419 : 285981 : add_stmt_costs (loop_vinfo->scalar_costs,
1420 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1421 : 285981 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1422 : 285981 : }
1423 : :
1424 : : /* Function vect_analyze_loop_form.
1425 : :
1426 : : Verify that certain CFG restrictions hold, including:
1427 : : - the loop has a pre-header
1428 : : - the loop has a single entry
1429 : : - nested loops can have only a single exit.
1430 : : - the loop exit condition is simple enough
1431 : : - the number of iterations can be analyzed, i.e, a countable loop. The
1432 : : niter could be analyzed under some assumptions. */
1433 : :
1434 : : opt_result
1435 : 468047 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1436 : : vect_loop_form_info *info)
1437 : : {
1438 : 468047 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1439 : :
1440 : 468047 : edge exit_e = vec_init_loop_exit_info (loop);
1441 : 468047 : if (!exit_e)
1442 : 59964 : return opt_result::failure_at (vect_location,
1443 : : "not vectorized:"
1444 : : " could not determine main exit from"
1445 : : " loop with multiple exits.\n");
1446 : 408083 : if (loop_vectorized_call)
1447 : : {
1448 : 26260 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1449 : 26260 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1450 : 26260 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1451 : 26260 : if (!scalar_exit_e)
1452 : 0 : return opt_result::failure_at (vect_location,
1453 : : "not vectorized:"
1454 : : " could not determine main exit from"
1455 : : " loop with multiple exits.\n");
1456 : : }
1457 : :
1458 : 408083 : info->loop_exit = exit_e;
1459 : 408083 : if (dump_enabled_p ())
1460 : 15273 : dump_printf_loc (MSG_NOTE, vect_location,
1461 : : "using as main loop exit: %d -> %d [AUX: %p]\n",
1462 : 15273 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1463 : :
1464 : : /* Check if we have any control flow that doesn't leave the loop. */
1465 : 408083 : basic_block *bbs = get_loop_body (loop);
1466 : 1365511 : for (unsigned i = 0; i < loop->num_nodes; i++)
1467 : 1066020 : if (EDGE_COUNT (bbs[i]->succs) != 1
1468 : 1066020 : && (EDGE_COUNT (bbs[i]->succs) != 2
1469 : 630293 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1470 : : {
1471 : 108592 : free (bbs);
1472 : 108592 : return opt_result::failure_at (vect_location,
1473 : : "not vectorized:"
1474 : : " unsupported control flow in loop.\n");
1475 : : }
1476 : 299491 : free (bbs);
1477 : :
1478 : : /* Different restrictions apply when we are considering an inner-most loop,
1479 : : vs. an outer (nested) loop.
1480 : : (FORNOW. May want to relax some of these restrictions in the future). */
1481 : :
1482 : 299491 : info->inner_loop_cond = NULL;
1483 : 299491 : if (!loop->inner)
1484 : : {
1485 : : /* Inner-most loop. */
1486 : :
1487 : 277893 : if (empty_block_p (loop->header))
1488 : 3 : return opt_result::failure_at (vect_location,
1489 : : "not vectorized: empty loop.\n");
1490 : : }
1491 : : else
1492 : : {
1493 : 21598 : class loop *innerloop = loop->inner;
1494 : 21598 : edge entryedge;
1495 : :
1496 : : /* Nested loop. We currently require that the loop is doubly-nested,
1497 : : contains a single inner loop with a single exit to the block
1498 : : with the single exit condition in the outer loop.
1499 : : Vectorizable outer-loops look like this:
1500 : :
1501 : : (pre-header)
1502 : : |
1503 : : header <---+
1504 : : | |
1505 : : inner-loop |
1506 : : | |
1507 : : tail ------+
1508 : : |
1509 : : (exit-bb)
1510 : :
1511 : : The inner-loop also has the properties expected of inner-most loops
1512 : : as described above. */
1513 : :
1514 : 21598 : if ((loop->inner)->inner || (loop->inner)->next)
1515 : 2977 : return opt_result::failure_at (vect_location,
1516 : : "not vectorized:"
1517 : : " multiple nested loops.\n");
1518 : :
1519 : 18621 : entryedge = loop_preheader_edge (innerloop);
1520 : 18621 : if (entryedge->src != loop->header
1521 : 18271 : || !single_exit (innerloop)
1522 : 29907 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1523 : 7620 : return opt_result::failure_at (vect_location,
1524 : : "not vectorized:"
1525 : : " unsupported outerloop form.\n");
1526 : :
1527 : : /* Analyze the inner-loop. */
1528 : 11001 : vect_loop_form_info inner;
1529 : 11001 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1530 : 11001 : if (!res)
1531 : : {
1532 : 1272 : if (dump_enabled_p ())
1533 : 5 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1534 : : "not vectorized: Bad inner loop.\n");
1535 : 1272 : return res;
1536 : : }
1537 : :
1538 : : /* Don't support analyzing niter under assumptions for inner
1539 : : loop. */
1540 : 9729 : if (!integer_onep (inner.assumptions))
1541 : 283 : return opt_result::failure_at (vect_location,
1542 : : "not vectorized: Bad inner loop.\n");
1543 : :
1544 : 9446 : if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1545 : 1086 : return opt_result::failure_at (vect_location,
1546 : : "not vectorized: inner-loop count not"
1547 : : " invariant.\n");
1548 : :
1549 : 8360 : if (dump_enabled_p ())
1550 : 946 : dump_printf_loc (MSG_NOTE, vect_location,
1551 : : "Considering outer-loop vectorization.\n");
1552 : 8360 : info->inner_loop_cond = inner.conds[0];
1553 : 11001 : }
1554 : :
1555 : 286250 : if (EDGE_COUNT (loop->header->preds) != 2)
1556 : 0 : return opt_result::failure_at (vect_location,
1557 : : "not vectorized:"
1558 : : " too many incoming edges.\n");
1559 : :
1560 : : /* We assume that the latch is empty. */
1561 : 286250 : basic_block latch = loop->latch;
1562 : 286250 : do
1563 : : {
1564 : 286250 : if (!empty_block_p (latch)
1565 : 286250 : || !gimple_seq_empty_p (phi_nodes (latch)))
1566 : 20043 : return opt_result::failure_at (vect_location,
1567 : : "not vectorized: latch block not "
1568 : : "empty.\n");
1569 : 266207 : latch = single_pred (latch);
1570 : : }
1571 : 532414 : while (single_succ_p (latch));
1572 : :
1573 : : /* Make sure there is no abnormal exit. */
1574 : 266207 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1575 : 1173403 : for (edge e : exits)
1576 : : {
1577 : 374826 : if (e->flags & EDGE_ABNORMAL)
1578 : 44 : return opt_result::failure_at (vect_location,
1579 : : "not vectorized:"
1580 : : " abnormal loop exit edge.\n");
1581 : : }
1582 : :
1583 : 266163 : info->conds
1584 : 266163 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1585 : : &info->number_of_iterations,
1586 : 266163 : &info->number_of_iterationsm1);
1587 : 266163 : if (info->conds.is_empty ())
1588 : 36 : return opt_result::failure_at
1589 : 36 : (vect_location,
1590 : : "not vectorized: complicated exit condition.\n");
1591 : :
1592 : : /* Determine what the primary and alternate exit conds are. */
1593 : 631219 : for (unsigned i = 0; i < info->conds.length (); i++)
1594 : : {
1595 : 365092 : gcond *cond = info->conds[i];
1596 : 365092 : if (exit_e->src == gimple_bb (cond))
1597 : 266127 : std::swap (info->conds[0], info->conds[i]);
1598 : : }
1599 : :
1600 : 266127 : if (integer_zerop (info->assumptions)
1601 : 266127 : || !info->number_of_iterations
1602 : 532254 : || chrec_contains_undetermined (info->number_of_iterations))
1603 : 41062 : return opt_result::failure_at
1604 : 41062 : (info->conds[0],
1605 : : "not vectorized: number of iterations cannot be computed.\n");
1606 : :
1607 : 225065 : if (integer_zerop (info->number_of_iterations))
1608 : 14 : return opt_result::failure_at
1609 : 14 : (info->conds[0],
1610 : : "not vectorized: number of iterations = 0.\n");
1611 : :
1612 : 225051 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1613 : 121139 : && tree_to_shwi (info->number_of_iterations) > 0))
1614 : : {
1615 : 103912 : if (dump_enabled_p ())
1616 : : {
1617 : 2403 : dump_printf_loc (MSG_NOTE, vect_location,
1618 : : "Symbolic number of iterations is ");
1619 : 2403 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1620 : 2403 : dump_printf (MSG_NOTE, "\n");
1621 : : }
1622 : : }
1623 : :
1624 : 225051 : if (!integer_onep (info->assumptions))
1625 : : {
1626 : 10335 : if (dump_enabled_p ())
1627 : : {
1628 : 63 : dump_printf_loc (MSG_NOTE, vect_location,
1629 : : "Loop to be versioned with niter assumption ");
1630 : 63 : dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
1631 : 63 : dump_printf (MSG_NOTE, "\n");
1632 : : }
1633 : : }
1634 : :
1635 : 225051 : return opt_result::success ();
1636 : 266207 : }
1637 : :
1638 : : /* Create a loop_vec_info for LOOP with SHARED and the
1639 : : vect_analyze_loop_form result. */
1640 : :
1641 : : loop_vec_info
1642 : 416289 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1643 : : const vect_loop_form_info *info,
1644 : : loop_vec_info orig_loop_info)
1645 : : {
1646 : 416289 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1647 : 416289 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1648 : 416289 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1649 : 416289 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1650 : 416289 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1651 : 416289 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1652 : 171 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1653 : 171 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1654 : : else
1655 : 416118 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1656 : : /* Also record the assumptions for versioning. */
1657 : 416289 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1658 : 20575 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1659 : :
1660 : 1884467 : for (gcond *cond : info->conds)
1661 : : {
1662 : 635600 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1663 : : /* Mark the statement as a condition. */
1664 : 635600 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1665 : : }
1666 : :
1667 : 635600 : for (unsigned i = 1; i < info->conds.length (); i ++)
1668 : 219311 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1669 : 416289 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1670 : :
1671 : 416289 : LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1672 : :
1673 : : /* Check to see if we're vectorizing multiple exits. */
1674 : 416289 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1675 : 416289 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1676 : :
1677 : 416289 : if (info->inner_loop_cond)
1678 : : {
1679 : : /* If we have an estimate on the number of iterations of the inner
1680 : : loop use that to limit the scale for costing, otherwise use
1681 : : --param vect-inner-loop-cost-factor literally. */
1682 : 8470 : widest_int nit;
1683 : 8470 : if (estimated_stmt_executions (loop->inner, &nit))
1684 : 7223 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1685 : 7223 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1686 : 8470 : }
1687 : :
1688 : 416289 : return loop_vinfo;
1689 : : }
1690 : :
1691 : :
1692 : :
1693 : : /* Return true if we know that the iteration count is smaller than the
1694 : : vectorization factor. Return false if it isn't, or if we can't be sure
1695 : : either way. */
1696 : :
1697 : : static bool
1698 : 112596 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1699 : : {
1700 : 112596 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1701 : :
1702 : 112596 : HOST_WIDE_INT max_niter;
1703 : 112596 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1704 : 53536 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1705 : : else
1706 : 59060 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1707 : :
1708 : 112596 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1709 : 10624 : return true;
1710 : :
1711 : : return false;
1712 : : }
1713 : :
1714 : : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1715 : : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1716 : : definitely no, or -1 if it's worth retrying. */
1717 : :
1718 : : static int
1719 : 112604 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1720 : : unsigned *suggested_unroll_factor)
1721 : : {
1722 : 112604 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1723 : 112604 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1724 : :
1725 : : /* Only loops that can handle partially-populated vectors can have iteration
1726 : : counts less than the vectorization factor. */
1727 : 112604 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
1728 : 112604 : && vect_known_niters_smaller_than_vf (loop_vinfo))
1729 : : {
1730 : 10614 : if (dump_enabled_p ())
1731 : 236 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1732 : : "not vectorized: iteration count smaller than "
1733 : : "vectorization factor.\n");
1734 : 10614 : return 0;
1735 : : }
1736 : :
1737 : : /* If we know the number of iterations we can do better, for the
1738 : : epilogue we can also decide whether the main loop leaves us
1739 : : with enough iterations, prefering a smaller vector epilog then
1740 : : also possibly used for the case we skip the vector loop. */
1741 : 101990 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1742 : : {
1743 : 44150 : widest_int scalar_niters
1744 : 44150 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
1745 : 44150 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1746 : : {
1747 : 2671 : loop_vec_info orig_loop_vinfo
1748 : : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1749 : 2671 : loop_vec_info main_loop_vinfo
1750 : : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
1751 : 2671 : unsigned lowest_vf
1752 : 2671 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
1753 : 2671 : int prolog_peeling = 0;
1754 : 2671 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
1755 : 2671 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
1756 : 2671 : if (prolog_peeling >= 0
1757 : 2671 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
1758 : : lowest_vf))
1759 : : {
1760 : 5332 : unsigned gap
1761 : 2666 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
1762 : 5332 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
1763 : 5332 : % lowest_vf + gap);
1764 : : }
1765 : : }
1766 : : /* Reject vectorizing for a single scalar iteration, even if
1767 : : we could in principle implement that using partial vectors. */
1768 : 44150 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1769 : 44150 : if (scalar_niters <= peeling_gap + 1)
1770 : : {
1771 : 784 : if (dump_enabled_p ())
1772 : 168 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 : : "not vectorized: loop only has a single "
1774 : : "scalar iteration.\n");
1775 : 784 : return 0;
1776 : : }
1777 : :
1778 : 43366 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1779 : : {
1780 : : /* Check that the loop processes at least one full vector. */
1781 : 43355 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1782 : 43355 : if (known_lt (scalar_niters, vf))
1783 : : {
1784 : 361 : if (dump_enabled_p ())
1785 : 293 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1786 : : "loop does not have enough iterations "
1787 : : "to support vectorization.\n");
1788 : 401 : return 0;
1789 : : }
1790 : :
1791 : : /* If we need to peel an extra epilogue iteration to handle data
1792 : : accesses with gaps, check that there are enough scalar iterations
1793 : : available.
1794 : :
1795 : : The check above is redundant with this one when peeling for gaps,
1796 : : but the distinction is useful for diagnostics. */
1797 : 42994 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1798 : 43284 : && known_le (scalar_niters, vf))
1799 : : {
1800 : 40 : if (dump_enabled_p ())
1801 : 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1802 : : "loop does not have enough iterations "
1803 : : "to support peeling for gaps.\n");
1804 : 40 : return 0;
1805 : : }
1806 : : }
1807 : 44150 : }
1808 : :
1809 : : /* If using the "very cheap" model. reject cases in which we'd keep
1810 : : a copy of the scalar code (even if we might be able to vectorize it). */
1811 : 100805 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1812 : 100805 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1813 : 50944 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
1814 : : {
1815 : 721 : if (dump_enabled_p ())
1816 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1817 : : "some scalar iterations would need to be peeled\n");
1818 : 721 : return 0;
1819 : : }
1820 : :
1821 : 100084 : int min_profitable_iters, min_profitable_estimate;
1822 : 100084 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1823 : : &min_profitable_estimate,
1824 : : suggested_unroll_factor);
1825 : :
1826 : 100084 : if (min_profitable_iters < 0)
1827 : : {
1828 : 26003 : if (dump_enabled_p ())
1829 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1830 : : "not vectorized: vectorization not profitable.\n");
1831 : 26003 : if (dump_enabled_p ())
1832 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1833 : : "not vectorized: vector version will never be "
1834 : : "profitable.\n");
1835 : 26003 : return -1;
1836 : : }
1837 : :
1838 : 74081 : int min_scalar_loop_bound = (param_min_vect_loop_bound
1839 : 74081 : * assumed_vf);
1840 : :
1841 : : /* Use the cost model only if it is more conservative than user specified
1842 : : threshold. */
1843 : 74081 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1844 : : min_profitable_iters);
1845 : :
1846 : 74081 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1847 : :
1848 : 37758 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1849 : 111839 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1850 : : {
1851 : 408 : if (dump_enabled_p ())
1852 : 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853 : : "not vectorized: vectorization not profitable.\n");
1854 : 408 : if (dump_enabled_p ())
1855 : 1 : dump_printf_loc (MSG_NOTE, vect_location,
1856 : : "not vectorized: iteration count smaller than user "
1857 : : "specified loop bound parameter or minimum profitable "
1858 : : "iterations (whichever is more conservative).\n");
1859 : 408 : return 0;
1860 : : }
1861 : :
1862 : : /* The static profitablity threshold min_profitable_estimate includes
1863 : : the cost of having to check at runtime whether the scalar loop
1864 : : should be used instead. If it turns out that we don't need or want
1865 : : such a check, the threshold we should use for the static estimate
1866 : : is simply the point at which the vector loop becomes more profitable
1867 : : than the scalar loop. */
1868 : 73673 : if (min_profitable_estimate > min_profitable_iters
1869 : 15562 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1870 : 15092 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1871 : 309 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1872 : 73982 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1873 : : {
1874 : 11 : if (dump_enabled_p ())
1875 : 6 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1876 : : " choice between the scalar and vector loops\n");
1877 : 11 : min_profitable_estimate = min_profitable_iters;
1878 : : }
1879 : :
1880 : : /* If the vector loop needs multiple iterations to be beneficial then
1881 : : things are probably too close to call, and the conservative thing
1882 : : would be to stick with the scalar code. */
1883 : 73673 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1884 : 73673 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1885 : : {
1886 : 8493 : if (dump_enabled_p ())
1887 : 177 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 : : "one iteration of the vector loop would be"
1889 : : " more expensive than the equivalent number of"
1890 : : " iterations of the scalar loop\n");
1891 : 8493 : return 0;
1892 : : }
1893 : :
1894 : 65180 : HOST_WIDE_INT estimated_niter;
1895 : :
1896 : : /* If we are vectorizing an epilogue then we know the maximum number of
1897 : : scalar iterations it will cover is at least one lower than the
1898 : : vectorization factor of the main loop. */
1899 : 65180 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1900 : 10720 : estimated_niter
1901 : 10720 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1902 : : else
1903 : : {
1904 : 54460 : estimated_niter = estimated_stmt_executions_int (loop);
1905 : 54460 : if (estimated_niter == -1)
1906 : 20486 : estimated_niter = likely_max_stmt_executions_int (loop);
1907 : : }
1908 : 31206 : if (estimated_niter != -1
1909 : 63567 : && ((unsigned HOST_WIDE_INT) estimated_niter
1910 : 63567 : < MAX (th, (unsigned) min_profitable_estimate)))
1911 : : {
1912 : 4345 : if (dump_enabled_p ())
1913 : 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914 : : "not vectorized: estimated iteration count too "
1915 : : "small.\n");
1916 : 4345 : if (dump_enabled_p ())
1917 : 28 : dump_printf_loc (MSG_NOTE, vect_location,
1918 : : "not vectorized: estimated iteration count smaller "
1919 : : "than specified loop bound parameter or minimum "
1920 : : "profitable iterations (whichever is more "
1921 : : "conservative).\n");
1922 : 4345 : return -1;
1923 : : }
1924 : :
1925 : : return 1;
1926 : : }
1927 : :
1928 : : static opt_result
1929 : 222513 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1930 : : vec<data_reference_p> *datarefs)
1931 : : {
1932 : 680994 : for (unsigned i = 0; i < loop->num_nodes; i++)
1933 : 1006668 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1934 : 3773046 : !gsi_end_p (gsi); gsi_next (&gsi))
1935 : : {
1936 : 3314565 : gimple *stmt = gsi_stmt (gsi);
1937 : 3314565 : if (is_gimple_debug (stmt))
1938 : 1209427 : continue;
1939 : 2105266 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1940 : : NULL, 0);
1941 : 2105266 : if (!res)
1942 : : {
1943 : 44981 : if (is_gimple_call (stmt) && loop->safelen)
1944 : : {
1945 : 398 : tree fndecl = gimple_call_fndecl (stmt), op;
1946 : 398 : if (fndecl == NULL_TREE
1947 : 398 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
1948 : : {
1949 : 0 : fndecl = gimple_call_arg (stmt, 0);
1950 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
1951 : 0 : fndecl = TREE_OPERAND (fndecl, 0);
1952 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
1953 : : }
1954 : 398 : if (fndecl != NULL_TREE)
1955 : : {
1956 : 361 : cgraph_node *node = cgraph_node::get (fndecl);
1957 : 361 : if (node != NULL && node->simd_clones != NULL)
1958 : : {
1959 : 129 : unsigned int j, n = gimple_call_num_args (stmt);
1960 : 539 : for (j = 0; j < n; j++)
1961 : : {
1962 : 282 : op = gimple_call_arg (stmt, j);
1963 : 282 : if (DECL_P (op)
1964 : 282 : || (REFERENCE_CLASS_P (op)
1965 : 0 : && get_base_address (op)))
1966 : : break;
1967 : : }
1968 : 129 : op = gimple_call_lhs (stmt);
1969 : : /* Ignore #pragma omp declare simd functions
1970 : : if they don't have data references in the
1971 : : call stmt itself. */
1972 : 257 : if (j == n
1973 : 129 : && !(op
1974 : 118 : && (DECL_P (op)
1975 : 118 : || (REFERENCE_CLASS_P (op)
1976 : 0 : && get_base_address (op)))))
1977 : 128 : continue;
1978 : : }
1979 : : }
1980 : : }
1981 : 44853 : return res;
1982 : : }
1983 : : /* If dependence analysis will give up due to the limit on the
1984 : : number of datarefs stop here and fail fatally. */
1985 : 3616725 : if (datarefs->length ()
1986 : 1556440 : > (unsigned)param_loop_max_datarefs_for_datadeps)
1987 : 0 : return opt_result::failure_at (stmt, "exceeded param "
1988 : : "loop-max-datarefs-for-datadeps\n");
1989 : : }
1990 : 177660 : return opt_result::success ();
1991 : : }
1992 : :
1993 : : /* Determine if operating on full vectors for LOOP_VINFO might leave
1994 : : some scalar iterations still to do. If so, decide how we should
1995 : : handle those scalar iterations. The possibilities are:
1996 : :
1997 : : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
1998 : : In this case:
1999 : :
2000 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2001 : : LOOP_VINFO_PEELING_FOR_NITER == false
2002 : :
2003 : : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2004 : : to handle the remaining scalar iterations. In this case:
2005 : :
2006 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2007 : : LOOP_VINFO_PEELING_FOR_NITER == true
2008 : :
2009 : : The MASKED_P argument specifies to what extent
2010 : : param_vect_partial_vector_usage is to be honored. For MASKED_P == 0
2011 : : no partial vectors are to be used, for MASKED_P == -1 it's
2012 : : param_vect_partial_vector_usage that gets to decide whether we may
2013 : : consider partial vector usage. For MASKED_P == 1 partial vectors
2014 : : may be used if possible.
2015 : :
2016 : : */
2017 : :
2018 : : static opt_result
2019 : 113391 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2020 : : int masked_p)
2021 : : {
2022 : : /* Determine whether there would be any scalar iterations left over. */
2023 : 113391 : bool need_peeling_or_partial_vectors_p
2024 : 113391 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2025 : :
2026 : : /* Decide whether to vectorize the loop with partial vectors. */
2027 : 113391 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2028 : 113391 : if (masked_p == 0
2029 : 113391 : || (masked_p == -1 && param_vect_partial_vector_usage == 0))
2030 : : /* If requested explicitly do not use partial vectors. */
2031 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2032 : 119 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2033 : 39 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2034 : 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2035 : 119 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2036 : 39 : && need_peeling_or_partial_vectors_p)
2037 : : {
2038 : : /* For partial-vector-usage=1, try to push the handling of partial
2039 : : vectors to the epilogue, with the main loop continuing to operate
2040 : : on full vectors.
2041 : :
2042 : : If we are unrolling we also do not want to use partial vectors. This
2043 : : is to avoid the overhead of generating multiple masks and also to
2044 : : avoid having to execute entire iterations of FALSE masked instructions
2045 : : when dealing with one or less full iterations.
2046 : :
2047 : : ??? We could then end up failing to use partial vectors if we
2048 : : decide to peel iterations into a prologue, and if the main loop
2049 : : then ends up processing fewer than VF iterations. */
2050 : 32 : if ((param_vect_partial_vector_usage == 1
2051 : 8 : || loop_vinfo->suggested_unroll_factor > 1)
2052 : 24 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2053 : 49 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2054 : : ;
2055 : : else
2056 : 25 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2057 : : }
2058 : :
2059 : 113391 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2060 : 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2061 : 0 : return opt_result::failure_at (vect_location,
2062 : : "not vectorized: loop needs but cannot "
2063 : : "use partial vectors\n");
2064 : :
2065 : 113391 : if (dump_enabled_p ())
2066 : 11509 : dump_printf_loc (MSG_NOTE, vect_location,
2067 : : "operating on %s vectors%s.\n",
2068 : 11509 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2069 : : ? "partial" : "full",
2070 : 11509 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2071 : : ? " for epilogue loop" : "");
2072 : :
2073 : 113391 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2074 : 226782 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2075 : 113391 : && need_peeling_or_partial_vectors_p);
2076 : :
2077 : 113391 : return opt_result::success ();
2078 : : }
2079 : :
2080 : : /* Function vect_analyze_loop_2.
2081 : :
2082 : : Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2083 : : analyses will record information in some members of LOOP_VINFO. FATAL
2084 : : indicates if some analysis meets fatal error. If one non-NULL pointer
2085 : : SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2086 : : worked out suggested unroll factor, while one NULL pointer shows it's
2087 : : going to apply the suggested unroll factor.
2088 : : SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF is to hold whether single-lane
2089 : : slp was forced when the suggested unroll factor was worked out. */
2090 : : static opt_result
2091 : 415588 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, int masked_p, bool &fatal,
2092 : : unsigned *suggested_unroll_factor,
2093 : : bool& single_lane_slp_done_for_suggested_uf)
2094 : : {
2095 : 415588 : opt_result ok = opt_result::success ();
2096 : 415588 : int res;
2097 : 415588 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2098 : 415588 : loop_vec_info orig_loop_vinfo = NULL;
2099 : :
2100 : : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2101 : : loop_vec_info of the first vectorized loop. */
2102 : 415588 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2103 : 17691 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2104 : : else
2105 : : orig_loop_vinfo = loop_vinfo;
2106 : 17691 : gcc_assert (orig_loop_vinfo);
2107 : :
2108 : : /* The first group of checks is independent of the vector size. */
2109 : 415588 : fatal = true;
2110 : :
2111 : 415588 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2112 : 415588 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2113 : 5 : return opt_result::failure_at (vect_location,
2114 : : "not vectorized: simd if(0)\n");
2115 : :
2116 : : /* Find all data references in the loop (which correspond to vdefs/vuses)
2117 : : and analyze their evolution in the loop. */
2118 : :
2119 : 415583 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2120 : :
2121 : : /* Gather the data references and count stmts in the loop. */
2122 : 415583 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2123 : : {
2124 : 222513 : opt_result res
2125 : 222513 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2126 : : &LOOP_VINFO_DATAREFS (loop_vinfo));
2127 : 222513 : if (!res)
2128 : : {
2129 : 44853 : if (dump_enabled_p ())
2130 : 1561 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2131 : : "not vectorized: loop contains function "
2132 : : "calls or data references that cannot "
2133 : : "be analyzed\n");
2134 : 44853 : return res;
2135 : : }
2136 : 177660 : loop_vinfo->shared->save_datarefs ();
2137 : : }
2138 : : else
2139 : 193070 : loop_vinfo->shared->check_datarefs ();
2140 : :
2141 : : /* Analyze the data references and also adjust the minimal
2142 : : vectorization factor according to the loads and stores. */
2143 : :
2144 : 370730 : ok = vect_analyze_data_refs (loop_vinfo, &fatal);
2145 : 370730 : if (!ok)
2146 : : {
2147 : 49788 : if (dump_enabled_p ())
2148 : 984 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2149 : : "bad data references.\n");
2150 : 49788 : return ok;
2151 : : }
2152 : :
2153 : : /* Check if we are applying unroll factor now. */
2154 : 320942 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2155 : 320942 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2156 : :
2157 : : /* When single-lane SLP was forced and we are applying suggested unroll
2158 : : factor, keep that decision here. */
2159 : 641884 : bool force_single_lane = (applying_suggested_uf
2160 : 320942 : && single_lane_slp_done_for_suggested_uf);
2161 : :
2162 : : /* Classify all cross-iteration scalar data-flow cycles.
2163 : : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2164 : 320942 : vect_analyze_scalar_cycles (loop_vinfo);
2165 : :
2166 : 320942 : vect_pattern_recog (loop_vinfo);
2167 : :
2168 : : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2169 : : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2170 : :
2171 : 320942 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2172 : 320942 : if (!ok)
2173 : : {
2174 : 7053 : if (dump_enabled_p ())
2175 : 264 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2176 : : "bad data access.\n");
2177 : 7053 : return ok;
2178 : : }
2179 : :
2180 : : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2181 : :
2182 : 313889 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2183 : 313889 : if (!ok)
2184 : : {
2185 : 13523 : if (dump_enabled_p ())
2186 : 304 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2187 : : "unexpected pattern.\n");
2188 : 13523 : return ok;
2189 : : }
2190 : :
2191 : : /* While the rest of the analysis below depends on it in some way. */
2192 : 300366 : fatal = false;
2193 : :
2194 : : /* Analyze data dependences between the data-refs in the loop
2195 : : and adjust the maximum vectorization factor according to
2196 : : the dependences.
2197 : : FORNOW: fail at the first data dependence that we encounter. */
2198 : :
2199 : 300366 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2200 : 300366 : if (!ok)
2201 : : {
2202 : 14385 : if (dump_enabled_p ())
2203 : 372 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204 : : "bad data dependence.\n");
2205 : 14385 : return ok;
2206 : : }
2207 : 285981 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2208 : :
2209 : : /* Compute the scalar iteration cost. */
2210 : 285981 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2211 : :
2212 : 285981 : bool saved_can_use_partial_vectors_p
2213 : : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2214 : :
2215 : : /* This is the point where we can re-start analysis with single-lane
2216 : : SLP forced. */
2217 : 407289 : start_over:
2218 : :
2219 : : /* Check the SLP opportunities in the loop, analyze and build
2220 : : SLP trees. */
2221 : 814578 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2222 : : force_single_lane);
2223 : 407289 : if (!ok)
2224 : 26904 : return ok;
2225 : :
2226 : : /* If there are any SLP instances mark them as pure_slp and compute
2227 : : the overall vectorization factor. */
2228 : 380385 : if (!vect_make_slp_decision (loop_vinfo))
2229 : 38753 : return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
2230 : :
2231 : 341632 : if (dump_enabled_p ())
2232 : 17799 : dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
2233 : :
2234 : : /* Dump the vectorization factor from the SLP decision. */
2235 : 341632 : if (dump_enabled_p ())
2236 : : {
2237 : 17799 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
2238 : 17799 : dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2239 : 17799 : dump_printf (MSG_NOTE, "\n");
2240 : : }
2241 : :
2242 : : /* We don't expect to have to roll back to anything other than an empty
2243 : : set of rgroups. */
2244 : 341632 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2245 : :
2246 : : /* Apply the suggested unrolling factor, this was determined by the backend
2247 : : during finish_cost the first time we ran the analyzis for this
2248 : : vector mode. */
2249 : 341632 : if (applying_suggested_uf)
2250 : 247 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2251 : :
2252 : : /* Now the vectorization factor is final. */
2253 : 341632 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2254 : 341632 : gcc_assert (known_ne (vectorization_factor, 0U));
2255 : :
2256 : : /* Optimize the SLP graph with the vectorization factor fixed. */
2257 : 341632 : vect_optimize_slp (loop_vinfo);
2258 : :
2259 : : /* Gather the loads reachable from the SLP graph entries. */
2260 : 341632 : vect_gather_slp_loads (loop_vinfo);
2261 : :
2262 : 341632 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2263 : : {
2264 : 13424 : dump_printf_loc (MSG_NOTE, vect_location,
2265 : : "vectorization_factor = ");
2266 : 13424 : dump_dec (MSG_NOTE, vectorization_factor);
2267 : 13424 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2268 : 13424 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2269 : : }
2270 : :
2271 : 341632 : if (max_vf != MAX_VECTORIZATION_FACTOR
2272 : 341632 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2273 : 41 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2274 : :
2275 : 341591 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2276 : :
2277 : : /* Analyze the alignment of the data-refs in the loop.
2278 : : Fail if a data reference is found that cannot be vectorized. */
2279 : :
2280 : 341591 : ok = vect_analyze_data_refs_alignment (loop_vinfo);
2281 : 341591 : if (!ok)
2282 : : {
2283 : 0 : if (dump_enabled_p ())
2284 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 : : "bad data alignment.\n");
2286 : 0 : return ok;
2287 : : }
2288 : :
2289 : : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2290 : : It is important to call pruning after vect_analyze_data_ref_accesses,
2291 : : since we use grouping information gathered by interleaving analysis. */
2292 : 341591 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2293 : 341591 : if (!ok)
2294 : 16483 : return ok;
2295 : :
2296 : : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2297 : : vectorization, since we do not want to add extra peeling or
2298 : : add versioning for alignment. */
2299 : 325108 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2300 : : /* This pass will decide on using loop versioning and/or loop peeling in
2301 : : order to enhance the alignment of data references in the loop. */
2302 : 310273 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2303 : 325108 : if (!ok)
2304 : 0 : return ok;
2305 : :
2306 : : /* Analyze operations in the SLP instances. We can't simply
2307 : : remove unsupported SLP instances as this makes the above
2308 : : SLP kind detection invalid and might also affect the VF. */
2309 : 325108 : if (! vect_slp_analyze_operations (loop_vinfo))
2310 : : {
2311 : 211717 : ok = opt_result::failure_at (vect_location,
2312 : : "unsupported SLP instances\n");
2313 : 211717 : goto again;
2314 : : }
2315 : :
2316 : : /* For now, we don't expect to mix both masking and length approaches for one
2317 : : loop, disable it if both are recorded. */
2318 : 113391 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2319 : 15660 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2320 : 129045 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2321 : : {
2322 : 0 : if (dump_enabled_p ())
2323 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2324 : : "can't vectorize a loop with partial vectors"
2325 : : " because we don't expect to mix different"
2326 : : " approaches with partial vectors for the"
2327 : : " same loop.\n");
2328 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2329 : : }
2330 : :
2331 : : /* If we still have the option of using partial vectors,
2332 : : check whether we can generate the necessary loop controls. */
2333 : 113391 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2334 : : {
2335 : 15660 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2336 : : {
2337 : 15654 : if (!vect_verify_full_masking (loop_vinfo)
2338 : 15654 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2339 : 3108 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2340 : : }
2341 : : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2342 : 6 : if (!vect_verify_loop_lens (loop_vinfo))
2343 : 6 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2344 : : }
2345 : :
2346 : : /* Decide whether this loop_vinfo should use partial vectors or peeling,
2347 : : assuming that the loop will be used as a main loop. We will redo
2348 : : this analysis later if we instead decide to use the loop as an
2349 : : epilogue loop. */
2350 : 113391 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, masked_p);
2351 : 113391 : if (!ok)
2352 : 0 : return ok;
2353 : :
2354 : : /* If we're vectorizing a loop that uses length "controls" and
2355 : : can iterate more than once, we apply decrementing IV approach
2356 : : in loop control. */
2357 : 113391 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2358 : 25 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2359 : 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2360 : 113391 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2361 : 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2362 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2363 : 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2364 : :
2365 : : /* If a loop uses length controls and has a decrementing loop control IV,
 2366 : :      we will normally pass that IV through a MIN_EXPR to calculate the
2367 : : basis for the length controls. E.g. in a loop that processes one
2368 : : element per scalar iteration, the number of elements would be
2369 : : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2370 : :
2371 : : This MIN_EXPR approach allows us to use pointer IVs with an invariant
2372 : : step, since only the final iteration of the vector loop can have
2373 : : inactive lanes.
2374 : :
2375 : : However, some targets have a dedicated instruction for calculating the
2376 : : preferred length, given the total number of elements that still need to
2377 : : be processed. This is encapsulated in the SELECT_VL internal function.
2378 : :
2379 : : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2380 : : to determine the basis for the length controls. However, unlike the
2381 : : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2382 : : lanes inactive in any iteration of the vector loop, not just the last
2383 : : iteration. This SELECT_VL approach therefore requires us to use pointer
2384 : : IVs with variable steps.
2385 : :
2386 : : Once we've decided how many elements should be processed by one
2387 : : iteration of the vector loop, we need to populate the rgroup controls.
2388 : : If a loop has multiple rgroups, we need to make sure that those rgroups
2389 : : "line up" (that is, they must be consistent about which elements are
2390 : : active and which aren't). This is done by vect_adjust_loop_lens_control.
2391 : :
2392 : : In principle, it would be possible to use vect_adjust_loop_lens_control
2393 : : on either the result of a MIN_EXPR or the result of a SELECT_VL.
2394 : : However:
2395 : :
2396 : : (1) In practice, it only makes sense to use SELECT_VL when a vector
2397 : : operation will be controlled directly by the result. It is not
2398 : : worth using SELECT_VL if it would only be the input to other
2399 : : calculations.
2400 : :
2401 : : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2402 : : pointer IV will need N updates by a variable amount (N-1 updates
2403 : : within the iteration and 1 update to move to the next iteration).
2404 : :
2405 : : Because of this, we prefer to use the MIN_EXPR approach whenever there
2406 : : is more than one length control.
2407 : :
2408 : : In addition, SELECT_VL always operates to a granularity of 1 unit.
2409 : : If we wanted to use it to control an SLP operation on N consecutive
2410 : : elements, we would need to make the SELECT_VL inputs measure scalar
2411 : : iterations (rather than elements) and then multiply the SELECT_VL
2412 : : result by N. But using SELECT_VL this way is inefficient because
2413 : : of (1) above.
2414 : :
2415 : : 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2416 : : satisfied:
2417 : :
2418 : : (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2419 : : (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2420 : :
2421 : : Since SELECT_VL (variable step) will make SCEV analysis failed and then
2422 : : we will fail to gain benefits of following unroll optimizations. We prefer
2423 : : using the MIN_EXPR approach in this situation. */
2424 : 113391 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2425 : : {
2426 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2427 : 0 : if (LOOP_VINFO_LENS (loop_vinfo).length () == 1
2428 : 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
2429 : 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2430 : : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2431 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2432 : :
2433 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2434 : 0 : for (auto rgc : LOOP_VINFO_LENS (loop_vinfo))
2435 : 0 : if (rgc.type
2436 : 0 : && !direct_internal_fn_supported_p (IFN_SELECT_VL,
2437 : : rgc.type, iv_type,
2438 : : OPTIMIZE_FOR_SPEED))
2439 : : {
2440 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2441 : 0 : break;
2442 : : }
2443 : :
2444 : : /* If any of the SLP instances cover more than a single lane
2445 : : we cannot use .SELECT_VL at the moment, even if the number
2446 : : of lanes is uniform throughout the SLP graph. */
2447 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2448 : 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
2449 : 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
2450 : 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
2451 : 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
2452 : : {
2453 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2454 : 0 : break;
2455 : : }
2456 : : }
2457 : :
2458 : : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2459 : : to be able to handle fewer than VF scalars, or needs to have a lower VF
2460 : : than the main loop. */
2461 : 113391 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2462 : 12452 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2463 : : {
2464 : 12443 : poly_uint64 unscaled_vf
2465 : 12443 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2466 : : orig_loop_vinfo->suggested_unroll_factor);
2467 : 12443 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
2468 : 285 : return opt_result::failure_at (vect_location,
2469 : : "Vectorization factor too high for"
2470 : : " epilogue loop.\n");
2471 : : }
2472 : :
 2473 : :    /* If the epilogue needs peeling for gaps but the main loop doesn't,
 2474 : :       give up on the epilogue.  */
2475 : 113106 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2476 : 12167 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2477 : 62 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
2478 : : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2479 : 4 : return opt_result::failure_at (vect_location,
2480 : : "Epilogue loop requires peeling for gaps "
2481 : : "but main loop does not.\n");
2482 : :
2483 : : /* If an epilogue loop is required make sure we can create one. */
2484 : 113102 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2485 : 111873 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2486 : 32689 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2487 : : {
2488 : 81447 : if (dump_enabled_p ())
2489 : 5106 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2490 : 81447 : if (!vect_can_advance_ivs_p (loop_vinfo)
2491 : 162396 : || !slpeel_can_duplicate_loop_p (loop,
2492 : : LOOP_VINFO_IV_EXIT (loop_vinfo),
2493 : 80949 : LOOP_VINFO_IV_EXIT (loop_vinfo)))
2494 : : {
2495 : 498 : ok = opt_result::failure_at (vect_location,
2496 : : "not vectorized: can't create required "
2497 : : "epilog loop\n");
2498 : 498 : goto again;
2499 : : }
2500 : : }
2501 : :
2502 : : /* Check the costings of the loop make vectorizing worthwhile. */
2503 : 112604 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2504 : 112604 : if (res < 0)
2505 : : {
2506 : 30348 : ok = opt_result::failure_at (vect_location,
2507 : : "Loop costings may not be worthwhile.\n");
2508 : 30348 : goto again;
2509 : : }
2510 : 82256 : if (!res)
2511 : 21421 : return opt_result::failure_at (vect_location,
2512 : : "Loop costings not worthwhile.\n");
2513 : :
2514 : : /* During peeling, we need to check if number of loop iterations is
2515 : : enough for both peeled prolog loop and vector loop. This check
2516 : : can be merged along with threshold check of loop versioning, so
2517 : : increase threshold for this case if necessary.
2518 : :
2519 : : If we are analyzing an epilogue we still want to check what its
2520 : : versioning threshold would be. If we decide to vectorize the epilogues we
2521 : : will want to use the lowest versioning threshold of all epilogues and main
2522 : : loop. This will enable us to enter a vectorized epilogue even when
2523 : : versioning the loop. We can't simply check whether the epilogue requires
2524 : : versioning though since we may have skipped some versioning checks when
2525 : : analyzing the epilogue. For instance, checks for alias versioning will be
2526 : : skipped when dealing with epilogues as we assume we already checked them
2527 : : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2528 : 60835 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2529 : : {
2530 : 5601 : poly_uint64 niters_th = 0;
2531 : 5601 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2532 : :
2533 : 5601 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2534 : : {
2535 : : /* Niters for peeled prolog loop. */
2536 : 5601 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2537 : : {
2538 : 126 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2539 : 126 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2540 : 126 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2541 : : }
2542 : : else
2543 : 5475 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2544 : : }
2545 : :
2546 : : /* Niters for at least one iteration of vectorized loop. */
2547 : 5601 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2548 : 5597 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2549 : : /* One additional iteration because of peeling for gap. */
2550 : 5601 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2551 : 59 : niters_th += 1;
2552 : :
2553 : : /* Use the same condition as vect_transform_loop to decide when to use
2554 : : the cost to determine a versioning threshold. */
2555 : 5601 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2556 : 5601 : && ordered_p (th, niters_th))
2557 : 3809 : niters_th = ordered_max (poly_uint64 (th), niters_th);
2558 : :
2559 : 5601 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2560 : : }
2561 : :
2562 : 60835 : gcc_assert (known_eq (vectorization_factor,
2563 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2564 : :
2565 : 60835 : single_lane_slp_done_for_suggested_uf = force_single_lane;
2566 : :
2567 : : /* Ok to vectorize! */
2568 : 60835 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2569 : 60835 : return opt_result::success ();
2570 : :
2571 : 242563 : again:
2572 : : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2573 : 242563 : gcc_assert (!ok);
2574 : :
2575 : : /* Try again with single-lane SLP. */
2576 : 242563 : if (force_single_lane)
2577 : 120184 : return ok;
2578 : :
2579 : : /* If we are applying suggested unroll factor, we don't need to
2580 : : re-try any more as we want to keep the SLP mode fixed. */
2581 : 122379 : if (applying_suggested_uf)
2582 : 6 : return ok;
2583 : :
2584 : : /* Likewise if the grouped loads or stores in the SLP cannot be handled
2585 : : via interleaving or lane instructions. */
2586 : : slp_instance instance;
2587 : : slp_tree node;
2588 : : unsigned i, j;
2589 : 483914 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2590 : : {
2591 : 362606 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
2592 : 0 : continue;
2593 : :
2594 : 362606 : stmt_vec_info vinfo;
2595 : 362606 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2596 : 362606 : if (!vinfo || !STMT_VINFO_GROUPED_ACCESS (vinfo))
2597 : 359997 : continue;
2598 : 2609 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2599 : 2609 : unsigned int size = DR_GROUP_SIZE (vinfo);
2600 : 2609 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
2601 : 2609 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
2602 : 4532 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2603 : 5210 : && ! vect_grouped_store_supported (vectype, size))
2604 : 678 : return opt_result::failure_at (vinfo->stmt,
2605 : : "unsupported grouped store\n");
2606 : 364837 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2607 : : {
2608 : 2089 : vinfo = SLP_TREE_REPRESENTATIVE (node);
2609 : 2089 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
2610 : : {
2611 : 1830 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2612 : 1830 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2613 : 1830 : size = DR_GROUP_SIZE (vinfo);
2614 : 1830 : vectype = SLP_TREE_VECTYPE (node);
2615 : 1830 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
2616 : 1830 : && ! vect_grouped_load_supported (vectype, single_element_p,
2617 : : size))
2618 : 387 : return opt_result::failure_at (vinfo->stmt,
2619 : : "unsupported grouped load\n");
2620 : : }
2621 : : }
2622 : : }
2623 : :
2624 : : /* Roll back state appropriately. Force single-lane SLP this time. */
2625 : 121308 : force_single_lane = true;
2626 : 121308 : if (dump_enabled_p ())
2627 : 3233 : dump_printf_loc (MSG_NOTE, vect_location,
2628 : : "re-trying with single-lane SLP\n");
2629 : :
2630 : : /* Reset the vectorization factor. */
2631 : 121308 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
2632 : : /* Free the SLP instances. */
2633 : 482833 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2634 : 361525 : vect_free_slp_instance (instance);
2635 : 121308 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2636 : : /* Reset SLP type to loop_vect on all stmts. */
2637 : 471346 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2638 : : {
2639 : 350038 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2640 : 350038 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
2641 : 625255 : !gsi_end_p (si); gsi_next (&si))
2642 : : {
2643 : 275217 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2644 : 275217 : STMT_SLP_TYPE (stmt_info) = not_vect;
2645 : 275217 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2646 : 275217 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2647 : : {
2648 : : /* vectorizable_reduction adjusts reduction stmt def-types,
2649 : : restore them to that of the PHI. */
2650 : 19824 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2651 : 19824 : = STMT_VINFO_DEF_TYPE (stmt_info);
2652 : 19824 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2653 : : (STMT_VINFO_REDUC_DEF (stmt_info)))
2654 : 19824 : = STMT_VINFO_DEF_TYPE (stmt_info);
2655 : : }
2656 : : }
2657 : 700076 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
2658 : 2144021 : !gsi_end_p (si); gsi_next (&si))
2659 : : {
2660 : 1793983 : if (is_gimple_debug (gsi_stmt (si)))
2661 : 656093 : continue;
2662 : 1137890 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2663 : 1137890 : STMT_SLP_TYPE (stmt_info) = not_vect;
2664 : 1137890 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2665 : : {
2666 : 211285 : stmt_vec_info pattern_stmt_info
2667 : : = STMT_VINFO_RELATED_STMT (stmt_info);
2668 : 211285 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2669 : 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2670 : :
2671 : 211285 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2672 : 211285 : STMT_SLP_TYPE (pattern_stmt_info) = not_vect;
2673 : 211285 : for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2674 : 429466 : !gsi_end_p (pi); gsi_next (&pi))
2675 : 218181 : STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2676 : 218181 : = not_vect;
2677 : : }
2678 : : }
2679 : : }
2680 : : /* Free optimized alias test DDRS. */
2681 : 121308 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2682 : 121308 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2683 : 121308 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2684 : : /* Reset target cost data. */
2685 : 121308 : delete loop_vinfo->vector_costs;
2686 : 121308 : loop_vinfo->vector_costs = nullptr;
2687 : : /* Reset accumulated rgroup information. */
2688 : 121308 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
2689 : 121308 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
2690 : 121308 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2691 : : /* Reset assorted flags. */
2692 : 121308 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2693 : 121308 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2694 : 121308 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2695 : 121308 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2696 : 121308 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2697 : 121308 : = saved_can_use_partial_vectors_p;
2698 : 121308 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2699 : 121308 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2700 : 121308 : if (loop_vinfo->scan_map)
2701 : 122 : loop_vinfo->scan_map->empty ();
2702 : :
2703 : 121308 : goto start_over;
2704 : : }
2705 : :
2706 : : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2707 : : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2708 : : OLD_LOOP_VINFO is better unless something specifically indicates
2709 : : otherwise.
2710 : :
2711 : : Note that this deliberately isn't a partial order. */
2712 : :
2713 : : static bool
2714 : 0 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2715 : : loop_vec_info old_loop_vinfo)
2716 : : {
2717 : 0 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2718 : 0 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2719 : :
2720 : 0 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2721 : 0 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2722 : :
2723 : : /* Always prefer a VF of loop->simdlen over any other VF. */
2724 : 0 : if (loop->simdlen)
2725 : : {
2726 : 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2727 : 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2728 : 0 : if (new_simdlen_p != old_simdlen_p)
2729 : : return new_simdlen_p;
2730 : : }
2731 : :
2732 : 0 : const auto *old_costs = old_loop_vinfo->vector_costs;
2733 : 0 : const auto *new_costs = new_loop_vinfo->vector_costs;
2734 : 0 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2735 : 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2736 : :
2737 : 0 : return new_costs->better_main_loop_than_p (old_costs);
2738 : : }
2739 : :
2740 : : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2741 : : true if we should. */
2742 : :
2743 : : static bool
2744 : 0 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2745 : : loop_vec_info old_loop_vinfo)
2746 : : {
2747 : 0 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2748 : : return false;
2749 : :
2750 : 0 : if (dump_enabled_p ())
2751 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2752 : : "***** Preferring vector mode %s to vector mode %s\n",
2753 : 0 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
2754 : 0 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
2755 : : return true;
2756 : : }
2757 : :
2758 : : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
2759 : : not NULL. When MASKED_P is not -1 override the default
2760 : : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it.
2761 : : Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance MODE_I to the next
2762 : : mode useful to analyze.
2763 : : Return the loop_vinfo on success and wrapped null on failure. */
2764 : :
2765 : : static opt_loop_vec_info
2766 : 415341 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2767 : : const vect_loop_form_info *loop_form_info,
2768 : : loop_vec_info orig_loop_vinfo,
2769 : : const vector_modes &vector_modes, unsigned &mode_i,
2770 : : int masked_p,
2771 : : machine_mode &autodetected_vector_mode,
2772 : : bool &fatal)
2773 : : {
2774 : 415341 : loop_vec_info loop_vinfo
2775 : 415341 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
2776 : :
2777 : 415341 : machine_mode vector_mode = vector_modes[mode_i];
2778 : 415341 : loop_vinfo->vector_mode = vector_mode;
2779 : 415341 : unsigned int suggested_unroll_factor = 1;
2780 : 415341 : bool single_lane_slp_done_for_suggested_uf = false;
2781 : :
2782 : : /* Run the main analysis. */
2783 : 415341 : opt_result res = vect_analyze_loop_2 (loop_vinfo, masked_p, fatal,
2784 : : &suggested_unroll_factor,
2785 : : single_lane_slp_done_for_suggested_uf);
2786 : 415341 : if (dump_enabled_p ())
2787 : 19327 : dump_printf_loc (MSG_NOTE, vect_location,
2788 : : "***** Analysis %s with vector mode %s\n",
2789 : 19327 : res ? "succeeded" : "failed",
2790 : 19327 : GET_MODE_NAME (loop_vinfo->vector_mode));
2791 : :
2792 : 415341 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
2793 : 415341 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2794 : : /* Check to see if the user wants to unroll or if the target wants to. */
2795 : 469120 : && (suggested_unroll_factor > 1 || user_unroll > 1))
2796 : : {
2797 : 261 : if (suggested_unroll_factor == 1)
2798 : : {
2799 : 44 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
2800 : 44 : suggested_unroll_factor = user_unroll / assumed_vf;
2801 : 44 : if (suggested_unroll_factor > 1)
2802 : : {
2803 : 30 : if (dump_enabled_p ())
2804 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
2805 : : "setting unroll factor to %d based on user requested "
2806 : : "unroll factor %d and suggested vectorization "
2807 : : "factor: %d\n",
2808 : : suggested_unroll_factor, user_unroll, assumed_vf);
2809 : : }
2810 : : }
2811 : :
2812 : 261 : if (suggested_unroll_factor > 1)
2813 : : {
2814 : 247 : if (dump_enabled_p ())
2815 : 44 : dump_printf_loc (MSG_NOTE, vect_location,
2816 : : "***** Re-trying analysis for unrolling"
2817 : : " with unroll factor %d and %s slp.\n",
2818 : : suggested_unroll_factor,
2819 : : single_lane_slp_done_for_suggested_uf
2820 : : ? "single-lane" : "");
2821 : 247 : loop_vec_info unroll_vinfo
2822 : 247 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
2823 : 247 : unroll_vinfo->vector_mode = vector_mode;
2824 : 247 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2825 : 247 : opt_result new_res
2826 : 247 : = vect_analyze_loop_2 (unroll_vinfo, masked_p, fatal, NULL,
2827 : : single_lane_slp_done_for_suggested_uf);
2828 : 247 : if (new_res)
2829 : : {
2830 : 201 : delete loop_vinfo;
2831 : 201 : loop_vinfo = unroll_vinfo;
2832 : : }
2833 : : else
2834 : 46 : delete unroll_vinfo;
2835 : : }
2836 : :
2837 : : /* Record that we have honored a user unroll factor. */
2838 : 261 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
2839 : : }
2840 : :
2841 : : /* Remember the autodetected vector mode. */
2842 : 415341 : if (vector_mode == VOIDmode)
2843 : 214621 : autodetected_vector_mode = loop_vinfo->vector_mode;
2844 : :
2845 : : /* Advance mode_i, first skipping modes that would result in the
2846 : : same analysis result. */
2847 : 1882537 : while (mode_i + 1 < vector_modes.length ()
2848 : 1319581 : && vect_chooses_same_modes_p (loop_vinfo,
2849 : 585983 : vector_modes[mode_i + 1]))
2850 : : {
2851 : 318257 : if (dump_enabled_p ())
2852 : 16086 : dump_printf_loc (MSG_NOTE, vect_location,
2853 : : "***** The result for vector mode %s would"
2854 : : " be the same\n",
2855 : 16086 : GET_MODE_NAME (vector_modes[mode_i + 1]));
2856 : 318257 : mode_i += 1;
2857 : : }
2858 : 415341 : if (mode_i + 1 < vector_modes.length ()
2859 : 683067 : && vect_chooses_same_modes_p (autodetected_vector_mode,
2860 : 267726 : vector_modes[mode_i + 1]))
2861 : : {
2862 : 349 : if (dump_enabled_p ())
2863 : 10 : dump_printf_loc (MSG_NOTE, vect_location,
2864 : : "***** Skipping vector mode %s, which would"
2865 : : " repeat the analysis for %s\n",
2866 : 10 : GET_MODE_NAME (vector_modes[mode_i + 1]),
2867 : 10 : GET_MODE_NAME (autodetected_vector_mode));
2868 : 349 : mode_i += 1;
2869 : : }
2870 : 415341 : mode_i++;
2871 : :
2872 : 415341 : if (!res)
2873 : : {
2874 : 354707 : delete loop_vinfo;
2875 : 354707 : if (fatal)
2876 : 65435 : gcc_checking_assert (orig_loop_vinfo == NULL);
2877 : 354707 : return opt_loop_vec_info::propagate_failure (res);
2878 : : }
2879 : :
2880 : 60634 : return opt_loop_vec_info::success (loop_vinfo);
2881 : : }
2882 : :
2883 : : /* Function vect_analyze_loop.
2884 : :
2885 : : Apply a set of analyses on LOOP, and create a loop_vec_info struct
2886 : : for it. The different analyses will record information in the
2887 : : loop_vec_info struct. */
2888 : : opt_loop_vec_info
2889 : 480962 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
2890 : : vec_info_shared *shared)
2891 : : {
2892 : 480962 : DUMP_VECT_SCOPE ("analyze_loop_nest");
2893 : :
2894 : 480962 : if (loop_outer (loop)
2895 : 480962 : && loop_vec_info_for_loop (loop_outer (loop))
2896 : 481463 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2897 : 501 : return opt_loop_vec_info::failure_at (vect_location,
2898 : : "outer-loop already vectorized.\n");
2899 : :
2900 : 480461 : if (!find_loop_nest (loop, &shared->loop_nest))
2901 : 24691 : return opt_loop_vec_info::failure_at
2902 : 24691 : (vect_location,
2903 : : "not vectorized: loop nest containing two or more consecutive inner"
2904 : : " loops cannot be vectorized\n");
2905 : :
2906 : : /* Analyze the loop form. */
2907 : 455770 : vect_loop_form_info loop_form_info;
2908 : 455770 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
2909 : : &loop_form_info);
2910 : 455770 : if (!res)
2911 : : {
2912 : 241149 : if (dump_enabled_p ())
2913 : 1641 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2914 : : "bad loop form.\n");
2915 : 241149 : return opt_loop_vec_info::propagate_failure (res);
2916 : : }
2917 : 214621 : if (!integer_onep (loop_form_info.assumptions))
2918 : : {
2919 : : /* We consider to vectorize this loop by versioning it under
2920 : : some assumptions. In order to do this, we need to clear
2921 : : existing information computed by scev and niter analyzer. */
2922 : 10052 : scev_reset_htab ();
2923 : 10052 : free_numbers_of_iterations_estimates (loop);
2924 : : /* Also set flag for this loop so that following scev and niter
2925 : : analysis are done under the assumptions. */
2926 : 10052 : loop_constraint_set (loop, LOOP_C_FINITE);
2927 : : }
2928 : : else
2929 : : /* Clear the existing niter information to make sure the nonwrapping flag
 2930 : :        will be calculated and set appropriately.  */
2931 : 204569 : free_numbers_of_iterations_estimates (loop);
2932 : :
2933 : 214621 : auto_vector_modes vector_modes;
2934 : : /* Autodetect first vector size we try. */
2935 : 214621 : vector_modes.safe_push (VOIDmode);
2936 : 214621 : unsigned int autovec_flags
2937 : 429242 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2938 : 214621 : loop->simdlen != 0);
2939 : 214621 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2940 : 214621 : && !unlimited_cost_model (loop));
2941 : 214621 : machine_mode autodetected_vector_mode = VOIDmode;
2942 : 214621 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2943 : 214621 : unsigned int mode_i = 0;
2944 : 214621 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2945 : :
2946 : : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
2947 : : a mode has not been analyzed. */
2948 : 214621 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
2949 : 2166294 : for (unsigned i = 0; i < vector_modes.length (); ++i)
2950 : 868526 : cached_vf_per_mode.safe_push (0);
2951 : :
2952 : : /* First determine the main loop vectorization mode, either the first
2953 : : one that works, starting with auto-detecting the vector mode and then
2954 : : following the targets order of preference, or the one with the
2955 : : lowest cost if pick_lowest_cost_p. */
2956 : 580679 : while (1)
2957 : : {
2958 : 397650 : bool fatal;
2959 : 397650 : unsigned int last_mode_i = mode_i;
2960 : : /* Set cached VF to -1 prior to analysis, which indicates a mode has
2961 : : failed. */
2962 : 397650 : cached_vf_per_mode[last_mode_i] = -1;
2963 : 397650 : opt_loop_vec_info loop_vinfo
2964 : 397650 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
2965 : : NULL, vector_modes, mode_i, -1,
2966 : : autodetected_vector_mode, fatal);
2967 : 397650 : if (fatal)
2968 : : break;
2969 : :
2970 : 332215 : if (loop_vinfo)
2971 : : {
 2972 : :            /* Analysis has been successful so update the VF value.  The
2973 : : VF should always be a multiple of unroll_factor and we want to
2974 : : capture the original VF here. */
2975 : 53779 : cached_vf_per_mode[last_mode_i]
2976 : 53779 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2977 : 53779 : loop_vinfo->suggested_unroll_factor);
2978 : : /* Once we hit the desired simdlen for the first time,
2979 : : discard any previous attempts. */
2980 : 53779 : if (simdlen
2981 : 53779 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2982 : : {
2983 : 47 : delete first_loop_vinfo;
2984 : : first_loop_vinfo = opt_loop_vec_info::success (NULL);
2985 : : simdlen = 0;
2986 : : }
2987 : 53732 : else if (pick_lowest_cost_p
2988 : 0 : && first_loop_vinfo
2989 : 53732 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2990 : : {
2991 : : /* Pick loop_vinfo over first_loop_vinfo. */
2992 : 0 : delete first_loop_vinfo;
2993 : 0 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
2994 : : }
2995 : 53779 : if (first_loop_vinfo == NULL)
2996 : : first_loop_vinfo = loop_vinfo;
2997 : : else
2998 : : {
2999 : 2 : delete loop_vinfo;
3000 : 2 : loop_vinfo = opt_loop_vec_info::success (NULL);
3001 : : }
3002 : :
3003 : : /* Commit to first_loop_vinfo if we have no reason to try
3004 : : alternatives. */
3005 : 53779 : if (!simdlen && !pick_lowest_cost_p)
3006 : : break;
3007 : : }
3008 : 278445 : if (mode_i == vector_modes.length ()
3009 : 278445 : || autodetected_vector_mode == VOIDmode)
3010 : : break;
3011 : :
3012 : : /* Try the next biggest vector size. */
3013 : 183029 : if (dump_enabled_p ())
3014 : 3735 : dump_printf_loc (MSG_NOTE, vect_location,
3015 : : "***** Re-trying analysis with vector mode %s\n",
3016 : 3735 : GET_MODE_NAME (vector_modes[mode_i]));
3017 : 183029 : }
3018 : 214621 : if (!first_loop_vinfo)
3019 : 160849 : return opt_loop_vec_info::propagate_failure (res);
3020 : :
3021 : 53772 : if (dump_enabled_p ())
3022 : 9078 : dump_printf_loc (MSG_NOTE, vect_location,
3023 : : "***** Choosing vector mode %s\n",
3024 : 9078 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3025 : :
3026 : : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3027 : : enabled, SIMDUID is not set, it is the innermost loop and we have
3028 : : either already found the loop's SIMDLEN or there was no SIMDLEN to
3029 : : begin with.
3030 : : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3031 : 53772 : bool vect_epilogues = (!simdlen
3032 : 53770 : && loop->inner == NULL
3033 : 53255 : && param_vect_epilogues_nomask
3034 : 52211 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3035 : : /* No code motion support for multiple epilogues so for now
3036 : : not supported when multiple exits. */
3037 : 25454 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3038 : 25026 : && !loop->simduid
3039 : 77385 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3040 : 53772 : if (!vect_epilogues)
3041 : 41162 : return first_loop_vinfo;
3042 : :
3043 : : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3044 : :
3045 : : /* For epilogues start the analysis from the first mode. The motivation
3046 : : behind starting from the beginning comes from cases where the VECTOR_MODES
3047 : : array may contain length-agnostic and length-specific modes. Their
3048 : : ordering is not guaranteed, so we could end up picking a mode for the main
3049 : : loop that is after the epilogue's optimal mode. */
3050 : 12610 : int masked_p = -1;
3051 : 12610 : if (!unlimited_cost_model (loop)
3052 : 12610 : && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3053 : : != VOIDmode))
3054 : : {
3055 : 4 : vector_modes[0]
3056 : 4 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3057 : 4 : cached_vf_per_mode[0] = 0;
3058 : : }
3059 : : else
3060 : 12606 : vector_modes[0] = autodetected_vector_mode;
3061 : 12610 : mode_i = 0;
3062 : :
3063 : 12645 : bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
3064 : 12610 : || masked_p == 1);
3065 : : if (supports_partial_vectors
3066 : 35 : && !partial_vectors_supported_p ()
3067 : 35 : && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo))
3068 : : supports_partial_vectors = false;
3069 : 12610 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3070 : :
3071 : 12610 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3072 : 12798 : do
3073 : : {
3074 : : /* Let the user override what the target suggests. */
3075 : 12704 : if (OPTION_SET_P (param_vect_partial_vector_usage))
3076 : 42 : masked_p = -1;
3077 : :
3078 : 43541 : while (1)
3079 : : {
3080 : : /* If the target does not support partial vectors we can shorten the
3081 : : number of modes to analyze for the epilogue as we know we can't
3082 : : pick a mode that would lead to a VF at least as big as the
3083 : : FIRST_VINFO_VF. */
3084 : 56997 : if (!supports_partial_vectors
3085 : 43541 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3086 : : {
3087 : 13481 : mode_i++;
3088 : 26962 : if (mode_i == vector_modes.length ())
3089 : : break;
3090 : 25825 : continue;
3091 : : }
3092 : : /* We would need an exhaustive search to find all modes we
3093 : : skipped but that would lead to the same result as the
3094 : : analysis it was skipped for and where we'd could check
3095 : : cached_vf_per_mode against.
3096 : : Check for the autodetected mode, which is the common
3097 : : situation on x86 which does not perform cost comparison. */
3098 : 42429 : if (!supports_partial_vectors
3099 : 30051 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3100 : 59572 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3101 : 29512 : vector_modes[mode_i]))
3102 : : {
3103 : 12369 : mode_i++;
3104 : 24738 : if (mode_i == vector_modes.length ())
3105 : : break;
3106 : 12369 : continue;
3107 : : }
3108 : :
3109 : 17691 : if (dump_enabled_p ())
3110 : 3134 : dump_printf_loc (MSG_NOTE, vect_location,
3111 : : "***** Re-trying epilogue analysis with vector "
3112 : 3134 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3113 : :
3114 : 17691 : bool fatal;
3115 : 17691 : opt_loop_vec_info loop_vinfo
3116 : 17691 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3117 : : orig_loop_vinfo,
3118 : : vector_modes, mode_i, masked_p,
3119 : : autodetected_vector_mode, fatal);
3120 : 17691 : if (fatal)
3121 : : break;
3122 : :
3123 : 17691 : if (loop_vinfo)
3124 : : {
3125 : 6855 : if (pick_lowest_cost_p
3126 : 0 : && orig_loop_vinfo->epilogue_vinfo
3127 : 6855 : && vect_joust_loop_vinfos (loop_vinfo,
3128 : 0 : orig_loop_vinfo->epilogue_vinfo))
3129 : : {
3130 : 0 : gcc_assert (vect_epilogues);
3131 : 0 : delete orig_loop_vinfo->epilogue_vinfo;
3132 : 0 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3133 : : }
3134 : 6855 : if (!orig_loop_vinfo->epilogue_vinfo)
3135 : 6855 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3136 : : else
3137 : : {
3138 : 0 : delete loop_vinfo;
3139 : 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3140 : : }
3141 : :
3142 : : /* For now only allow one epilogue loop, but allow
3143 : : pick_lowest_cost_p to replace it, so commit to the
3144 : : first epilogue if we have no reason to try alternatives. */
3145 : 6855 : if (!pick_lowest_cost_p)
3146 : : break;
3147 : : }
3148 : :
3149 : : /* Revert back to the default from the suggested prefered
3150 : : epilogue vectorization mode. */
3151 : 10836 : masked_p = -1;
3152 : 21672 : if (mode_i == vector_modes.length ())
3153 : : break;
3154 : : }
3155 : :
3156 : 12704 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3157 : 12704 : if (!orig_loop_vinfo)
3158 : : break;
3159 : :
3160 : : /* When we selected a first vectorized epilogue, see if the target
3161 : : suggests to have another one. */
3162 : 6855 : masked_p = -1;
3163 : 6855 : if (!unlimited_cost_model (loop)
3164 : 3990 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3165 : 10838 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3166 : : != VOIDmode))
3167 : : {
3168 : 188 : vector_modes[0]
3169 : 94 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3170 : 94 : cached_vf_per_mode[0] = 0;
3171 : 94 : mode_i = 0;
3172 : : }
3173 : : else
3174 : : break;
3175 : 94 : }
3176 : : while (1);
3177 : :
3178 : 12610 : if (first_loop_vinfo->epilogue_vinfo)
3179 : : {
3180 : 6766 : poly_uint64 lowest_th
3181 : 6766 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3182 : 6766 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3183 : 6855 : do
3184 : : {
3185 : 6855 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3186 : 6855 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3187 : : || maybe_ne (lowest_th, 0U));
3188 : : /* Keep track of the known smallest versioning threshold. */
3189 : 6855 : if (ordered_p (lowest_th, th))
3190 : 6855 : lowest_th = ordered_min (lowest_th, th);
3191 : 6855 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3192 : : }
3193 : 6855 : while (epilog_vinfo);
3194 : 6766 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3195 : 6766 : if (dump_enabled_p ())
3196 : 1367 : dump_printf_loc (MSG_NOTE, vect_location,
3197 : : "***** Choosing epilogue vector mode %s\n",
3198 : 1367 : GET_MODE_NAME
3199 : : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3200 : : }
3201 : :
3202 : 12610 : return first_loop_vinfo;
3203 : 670391 : }
3204 : :
3205 : : /* Return true if there is an in-order reduction function for CODE, storing
3206 : : it in *REDUC_FN if so. */
3207 : :
3208 : : static bool
3209 : 4716 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3210 : : {
3211 : : /* We support MINUS_EXPR by negating the operand. This also preserves an
3212 : : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3213 : : (-0.0) = -0.0. */
3214 : 4716 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3215 : : {
3216 : 4040 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3217 : 0 : return true;
3218 : : }
3219 : : return false;
3220 : : }
3221 : :
3222 : : /* Function reduction_fn_for_scalar_code
3223 : :
3224 : : Input:
3225 : : CODE - tree_code of a reduction operations.
3226 : :
3227 : : Output:
3228 : : REDUC_FN - the corresponding internal function to be used to reduce the
3229 : : vector of partial results into a single scalar result, or IFN_LAST
3230 : : if the operation is a supported reduction operation, but does not have
3231 : : such an internal function.
3232 : :
3233 : : Return FALSE if CODE currently cannot be vectorized as reduction. */
3234 : :
3235 : : bool
3236 : 2018939 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3237 : : {
3238 : 2018939 : if (code.is_tree_code ())
3239 : 2018885 : switch (tree_code (code))
3240 : : {
3241 : 14209 : case MAX_EXPR:
3242 : 14209 : *reduc_fn = IFN_REDUC_MAX;
3243 : 14209 : return true;
3244 : :
3245 : 50611 : case MIN_EXPR:
3246 : 50611 : *reduc_fn = IFN_REDUC_MIN;
3247 : 50611 : return true;
3248 : :
3249 : 1088697 : case PLUS_EXPR:
3250 : 1088697 : *reduc_fn = IFN_REDUC_PLUS;
3251 : 1088697 : return true;
3252 : :
3253 : 256715 : case BIT_AND_EXPR:
3254 : 256715 : *reduc_fn = IFN_REDUC_AND;
3255 : 256715 : return true;
3256 : :
3257 : 285788 : case BIT_IOR_EXPR:
3258 : 285788 : *reduc_fn = IFN_REDUC_IOR;
3259 : 285788 : return true;
3260 : :
3261 : 43701 : case BIT_XOR_EXPR:
3262 : 43701 : *reduc_fn = IFN_REDUC_XOR;
3263 : 43701 : return true;
3264 : :
3265 : 279164 : case MULT_EXPR:
3266 : 279164 : case MINUS_EXPR:
3267 : 279164 : *reduc_fn = IFN_LAST;
3268 : 279164 : return true;
3269 : :
3270 : : default:
3271 : : return false;
3272 : : }
3273 : : else
3274 : 54 : switch (combined_fn (code))
3275 : : {
3276 : 30 : CASE_CFN_FMAX:
3277 : 30 : *reduc_fn = IFN_REDUC_FMAX;
3278 : 30 : return true;
3279 : :
3280 : 24 : CASE_CFN_FMIN:
3281 : 24 : *reduc_fn = IFN_REDUC_FMIN;
3282 : 24 : return true;
3283 : :
3284 : : default:
3285 : : return false;
3286 : : }
3287 : : }
3288 : :
3289 : : /* Set *SBOOL_FN to the corresponding function working on vector masks
3290 : : for REDUC_FN. Return true if that exists, false otherwise. */
3291 : :
3292 : : static bool
3293 : 0 : sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
3294 : : {
3295 : 0 : switch (reduc_fn)
3296 : : {
3297 : 0 : case IFN_REDUC_AND:
3298 : 0 : *sbool_fn = IFN_REDUC_SBOOL_AND;
3299 : 0 : return true;
3300 : 0 : case IFN_REDUC_IOR:
3301 : 0 : *sbool_fn = IFN_REDUC_SBOOL_IOR;
3302 : 0 : return true;
3303 : 0 : case IFN_REDUC_XOR:
3304 : 0 : *sbool_fn = IFN_REDUC_SBOOL_XOR;
3305 : 0 : return true;
3306 : : default:
3307 : : return false;
3308 : : }
3309 : : }
3310 : :
3311 : : /* If there is a neutral value X such that a reduction would not be affected
3312 : : by the introduction of additional X elements, return that X, otherwise
3313 : : return null. CODE is the code of the reduction and SCALAR_TYPE is type
3314 : : of the scalar elements. If the reduction has just a single initial value
3315 : : then INITIAL_VALUE is that value, otherwise it is null.
3316 : : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3317 : : In that case no signed zero is returned. */
3318 : :
3319 : : tree
3320 : 76973 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3321 : : tree initial_value, bool as_initial)
3322 : : {
3323 : 76973 : if (code.is_tree_code ())
3324 : 76919 : switch (tree_code (code))
3325 : : {
3326 : 11421 : case DOT_PROD_EXPR:
3327 : 11421 : case SAD_EXPR:
3328 : 11421 : case MINUS_EXPR:
3329 : 11421 : case BIT_IOR_EXPR:
3330 : 11421 : case BIT_XOR_EXPR:
3331 : 11421 : return build_zero_cst (scalar_type);
3332 : 59513 : case WIDEN_SUM_EXPR:
3333 : 59513 : case PLUS_EXPR:
3334 : 59513 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3335 : 56 : return build_real (scalar_type, dconstm0);
3336 : : else
3337 : 59457 : return build_zero_cst (scalar_type);
3338 : :
3339 : 2013 : case MULT_EXPR:
3340 : 2013 : return build_one_cst (scalar_type);
3341 : :
3342 : 1446 : case BIT_AND_EXPR:
3343 : 1446 : return build_all_ones_cst (scalar_type);
3344 : :
3345 : : case MAX_EXPR:
3346 : : case MIN_EXPR:
3347 : : return initial_value;
3348 : :
3349 : 436 : default:
3350 : 436 : return NULL_TREE;
3351 : : }
3352 : : else
3353 : 54 : switch (combined_fn (code))
3354 : : {
3355 : : CASE_CFN_FMIN:
3356 : : CASE_CFN_FMAX:
3357 : : return initial_value;
3358 : :
3359 : 0 : default:
3360 : 0 : return NULL_TREE;
3361 : : }
3362 : : }
3363 : :
/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE
   statement STMT is printed to the dump file with message MSG as a
   prefix, using dump kind MSG_TYPE and the current vect_location.  */

static void
report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
{
  dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
}
3372 : :
3373 : : /* Return true if we need an in-order reduction for operation CODE
3374 : : on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3375 : : overflow must wrap. */
3376 : :
3377 : : bool
3378 : 6464201 : needs_fold_left_reduction_p (tree type, code_helper code)
3379 : : {
3380 : : /* CHECKME: check for !flag_finite_math_only too? */
3381 : 6464201 : if (SCALAR_FLOAT_TYPE_P (type))
3382 : : {
3383 : 547021 : if (code.is_tree_code ())
3384 : 546971 : switch (tree_code (code))
3385 : : {
3386 : : case MIN_EXPR:
3387 : : case MAX_EXPR:
3388 : : return false;
3389 : :
3390 : 545332 : default:
3391 : 545332 : return !flag_associative_math;
3392 : : }
3393 : : else
3394 : 50 : switch (combined_fn (code))
3395 : : {
3396 : : CASE_CFN_FMIN:
3397 : : CASE_CFN_FMAX:
3398 : : return false;
3399 : :
3400 : 2 : default:
3401 : 2 : return !flag_associative_math;
3402 : : }
3403 : : }
3404 : :
3405 : 5917180 : if (INTEGRAL_TYPE_P (type))
3406 : 5916351 : return (!code.is_tree_code ()
3407 : 5916351 : || !operation_no_trapping_overflow (type, tree_code (code)));
3408 : :
3409 : 829 : if (SAT_FIXED_POINT_TYPE_P (type))
3410 : : return true;
3411 : :
3412 : : return false;
3413 : : }
3414 : :
/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
   has a handled computation expression.  Store the main reduction
   operation in *CODE.  PATH is filled with the def-use chain from the
   latch argument back to the PHI result, as (iterator, use) pairs.
   INNER_LOOP_OF_DOUBLE_REDUC is true when PHI is the inner-loop PHI of
   a double reduction, which disallows out-of-loop uses on the path.  */

static bool
check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
		      tree loop_arg, code_helper *code,
		      vec<std::pair<ssa_op_iter, use_operand_p> > &path,
		      bool inner_loop_of_double_reduc)
{
  auto_bitmap visited;
  tree lookfor = PHI_RESULT (phi);
  ssa_op_iter curri;
  /* Position CURR on the PHI argument corresponding to LOOP_ARG.  */
  use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
  while (USE_FROM_PTR (curr) != loop_arg)
    curr = op_iter_next_use (&curri);
  curri.i = curri.numops;
  /* Depth-first walk from the latch definition back towards the PHI
     result, recording the walk in PATH and backtracking (via 'pop')
     when a dead end is reached.  */
  do
    {
      path.safe_push (std::make_pair (curri, curr));
      tree use = USE_FROM_PTR (curr);
      if (use == lookfor)
	break;
      gimple *def = SSA_NAME_DEF_STMT (use);
      if (gimple_nop_p (def)
	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
	{
	  /* Dead end: the definition is outside LOOP (or has no
	     defining stmt).  Backtrack to the next unvisited operand.  */
pop:
	  do
	    {
	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
	      curri = x.first;
	      curr = x.second;
	      do
		curr = op_iter_next_use (&curri);
	      /* Skip already visited or non-SSA operands (from iterating
	         over PHI args).  */
	      while (curr != NULL_USE_OPERAND_P
		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
			 || ! bitmap_set_bit (visited,
					      SSA_NAME_VERSION
						(USE_FROM_PTR (curr)))));
	    }
	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
	  if (curr == NULL_USE_OPERAND_P)
	    break;
	}
      else
	{
	  /* Descend into DEF, starting at its first not-yet-visited
	     SSA operand.  */
	  if (gimple_code (def) == GIMPLE_PHI)
	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
	  else
	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
	  while (curr != NULL_USE_OPERAND_P
		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
		     || ! bitmap_set_bit (visited,
					  SSA_NAME_VERSION
					    (USE_FROM_PTR (curr)))))
	    curr = op_iter_next_use (&curri);
	  if (curr == NULL_USE_OPERAND_P)
	    goto pop;
	}
    }
  while (1);
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
      unsigned i;
      std::pair<ssa_op_iter, use_operand_p> *x;
      FOR_EACH_VEC_ELT (path, i, x)
	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
      dump_printf (MSG_NOTE, "\n");
    }

  /* Check whether the reduction path detected is valid.  */
  bool fail = path.length () == 0;
  bool neg = false;
  int sign = -1;
  *code = ERROR_MARK;
  /* PATH[0] is the PHI itself; validate each statement on the rest of
     the path.  */
  for (unsigned i = 1; i < path.length (); ++i)
    {
      gimple *use_stmt = USE_STMT (path[i].second);
      gimple_match_op op;
      if (!gimple_extract_op (use_stmt, &op))
	{
	  fail = true;
	  break;
	}
      /* Determine OPI, the operand index through which the reduction
	 value flows into USE_STMT.  */
      unsigned int opi = op.num_ops;
      if (gassign *assign = dyn_cast<gassign *> (use_stmt))
	{
	  /* The following make sure we can compute the operand index
	     easily plus it mostly disallows chaining via COND_EXPR condition
	     operands.  */
	  for (opi = 0; opi < op.num_ops; ++opi)
	    if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
	      break;
	}
      else if (gcall *call = dyn_cast<gcall *> (use_stmt))
	{
	  for (opi = 0; opi < op.num_ops; ++opi)
	    if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
	      break;
	}
      if (opi == op.num_ops)
	{
	  fail = true;
	  break;
	}
      op.code = canonicalize_code (op.code, op.type);
      if (op.code == MINUS_EXPR)
	{
	  op.code = PLUS_EXPR;
	  /* Track whether we negate the reduction value each iteration.  */
	  if (op.ops[1] == op.ops[opi])
	    neg = ! neg;
	}
      else if (op.code == IFN_COND_SUB)
	{
	  op.code = IFN_COND_ADD;
	  /* Track whether we negate the reduction value each iteration.  */
	  if (op.ops[2] == op.ops[opi])
	    neg = ! neg;
	}
      /* For an FMA the reduction code is the PLUS if the addition chain
	 is the reduction.  */
      else if (op.code == IFN_FMA && opi == 2)
	op.code = PLUS_EXPR;
      /* No-op conversions on the path are allowed and do not set *CODE.  */
      if (CONVERT_EXPR_CODE_P (op.code)
	  && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
	;
      else if (*code == ERROR_MARK)
	{
	  /* First real operation on the path determines the reduction
	     code and its signedness.  */
	  *code = op.code;
	  sign = TYPE_SIGN (op.type);
	}
      else if (op.code != *code)
	{
	  fail = true;
	  break;
	}
      else if ((op.code == MIN_EXPR
		|| op.code == MAX_EXPR)
	       && sign != TYPE_SIGN (op.type))
	{
	  fail = true;
	  break;
	}
      /* Check there's only a single stmt the op is used on.  For the
	 not value-changing tail and the last stmt allow out-of-loop uses,
	 but not when this is the inner loop of a double reduction.
	 ??? We could relax this and handle arbitrary live stmts by
	 forcing a scalar epilogue for example.  */
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *op_use_stmt;
      unsigned cnt = 0;
      bool cond_fn_p = op.code.is_internal_fn ()
	&& (conditional_internal_fn_code (internal_fn (op.code))
	    != ERROR_MARK);

      FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
	{
	  /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
	     have op1 twice (once as definition, once as else) in the same
	     operation.  Enforce this.  */
	  if (cond_fn_p && op_use_stmt == use_stmt)
	    {
	      gcall *call = as_a<gcall *> (use_stmt);
	      unsigned else_pos
		= internal_fn_else_index (internal_fn (op.code));
	      if (gimple_call_arg (call, else_pos) != op.ops[opi])
		{
		  fail = true;
		  break;
		}
	      for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
		{
		  if (j == else_pos)
		    continue;
		  if (gimple_call_arg (call, j) == op.ops[opi])
		    cnt++;
		}
	    }
	  else if (!is_gimple_debug (op_use_stmt)
		   && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
		       || flow_bb_inside_loop_p (loop,
						 gimple_bb (op_use_stmt))))
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      cnt++;
	}

      if (cnt != 1)
	{
	  fail = true;
	  break;
	}
    }
  /* The path is a valid reduction only if it is complete, does not
     alternate sign, and found a real operation.  */
  return ! fail && ! neg && *code != ERROR_MARK;
}
3615 : :
3616 : : bool
3617 : 21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3618 : : tree loop_arg, enum tree_code code)
3619 : : {
3620 : 21 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3621 : 21 : code_helper code_;
3622 : 21 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
3623 : 21 : && code_ == code);
3624 : 21 : }
3625 : :
3626 : :
3627 : :
/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   or

   a3 = ...
   loop_header:
     a1 = phi < a0, a2 >
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

   a1 = phi < a0, a2 >
   inner loop (def of a3)
   a2 = phi < a3 >

   (4) Detect condition expressions, ie:
   for (int i = 0; i < N; i++)
     if (a[i] < val)
	ret_val = a[i];

   Returns the stmt_vec_info of the statement defining the latch value of
   the reduction PHI PHI_INFO, or NULL when no supported reduction was
   detected.  On a detected double reduction *DOUBLE_REDUC is set to the
   inner-loop PHI.  */

static stmt_vec_info
vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
			  gphi **double_reduc)
{
  gphi *phi = as_a <gphi *> (phi_info->stmt);
  gimple *phi_use_stmt = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  /* When double_reduc is NULL we are testing the inner loop of a
     double reduction.  */
  bool inner_loop_of_double_reduc = double_reduc == NULL;
  if (double_reduc)
    *double_reduc = NULL;
  STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;

  tree phi_name = PHI_RESULT (phi);
  /* ??? If there are no uses of the PHI result the inner loop reduction
     won't be detected as possibly double-reduction by vectorizable_reduction
     because that tries to walk the PHI arg from the preheader edge which
     can be constant.  See PR60382.  */
  if (has_zero_uses (phi_name))
    return NULL;
  class loop *loop = (gimple_bb (phi))->loop_father;
  /* Count distinct non-debug in-loop uses of the PHI result; any use
     outside the loop disqualifies the reduction.  */
  unsigned nphi_def_loop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;

      if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "intermediate value used outside loop.\n");

	  return NULL;
	}

      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
	 op1 twice (once as definition, once as else) in the same operation.
	 Only count it as one.  */
      if (use_stmt != phi_use_stmt)
	{
	  nphi_def_loop_uses++;
	  phi_use_stmt = use_stmt;
	}
    }

  tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
  if (TREE_CODE (latch_def) != SSA_NAME)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction: not ssa_name: %T\n", latch_def);
      return NULL;
    }

  stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
  if (!def_stmt_info
      || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
    return NULL;

  bool nested_in_vect_loop
    = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
  /* Count in-loop uses of the latch definition and collect the
     loop-closed PHIs using it outside the loop.  */
  unsigned nlatch_def_loop_uses = 0;
  auto_vec<gphi *, 3> lcphis;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	nlatch_def_loop_uses++;
      else
	/* We can have more than one loop-closed PHI.  */
	lcphis.safe_push (as_a <gphi *> (use_stmt));
    }

  /* If we are vectorizing an inner reduction we are executing that
     in the original order only in case we are not dealing with a
     double reduction.  */
  if (nested_in_vect_loop && !inner_loop_of_double_reduc)
    {
      if (dump_enabled_p ())
	report_vect_op (MSG_NOTE, def_stmt_info->stmt,
			"detected nested cycle: ");
      return def_stmt_info;
    }

  /* When the inner loop of a double reduction ends up with more than
     one loop-closed PHI we have failed to classify alternate such
     PHIs as double reduction, leading to wrong code.  See PR103237.  */
  if (inner_loop_of_double_reduc && lcphis.length () != 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "unhandle double reduction\n");
      return NULL;
    }

  /* If this isn't a nested cycle or if the nested cycle reduction value
     is used outside of the inner loop we cannot handle uses of the reduction
     value.  */
  if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction used in loop.\n");
      return NULL;
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
    {
      tree op1 = PHI_ARG_DEF (def_stmt, 0);
      if (gimple_phi_num_args (def_stmt) != 1
	  || TREE_CODE (op1) != SSA_NAME)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported phi node definition.\n");

	  return NULL;
	}

      /* Verify there is an inner cycle composed of the PHI phi_use_stmt
	 and the latch definition op1.  */
      gimple *def1 = SSA_NAME_DEF_STMT (op1);
      if (gimple_bb (def1)
	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
	  && loop->inner
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
	  && (is_gimple_assign (def1) || is_gimple_call (def1))
	  && is_a <gphi *> (phi_use_stmt)
	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
	  && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
					    loop_latch_edge (loop->inner)))
	  && lcphis.length () == 1)
	{
	  if (dump_enabled_p ())
	    report_vect_op (MSG_NOTE, def_stmt,
			    "detected double reduction: ");

	  *double_reduc = as_a <gphi *> (phi_use_stmt);
	  return def_stmt_info;
	}

      return NULL;
    }

  /* Look for the expression computing latch_def from the loop PHI result.  */
  auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
  code_helper code;
  if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
			    path, inner_loop_of_double_reduc))
    {
      STMT_VINFO_REDUC_CODE (phi_info) = code;
      if (code == COND_EXPR && !nested_in_vect_loop)
	STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;

      /* Fill in STMT_VINFO_REDUC_IDX.  */
      unsigned i;
      for (i = path.length () - 1; i >= 1; --i)
	{
	  gimple *stmt = USE_STMT (path[i].second);
	  stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
	  gimple_match_op op;
	  if (!gimple_extract_op (stmt, &op))
	    gcc_unreachable ();
	  /* The reduction operand index is the distance of the use from
	     the statement's first operand.  */
	  if (gassign *assign = dyn_cast<gassign *> (stmt))
	    STMT_VINFO_REDUC_IDX (stmt_info)
	      = path[i].second->use - gimple_assign_rhs1_ptr (assign);
	  else
	    {
	      gcall *call = as_a<gcall *> (stmt);
	      STMT_VINFO_REDUC_IDX (stmt_info)
		= path[i].second->use - gimple_call_arg_ptr (call, 0);
	    }
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "reduction: detected reduction\n");

      return def_stmt_info;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "reduction: unknown pattern\n");

  return NULL;
}
3867 : :
3868 : : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3869 : : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3870 : : or -1 if not known. */
3871 : :
3872 : : static int
3873 : 347700 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3874 : : {
3875 : 347700 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3876 : 347700 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3877 : : {
3878 : 139174 : if (dump_enabled_p ())
3879 : 2800 : dump_printf_loc (MSG_NOTE, vect_location,
3880 : : "cost model: epilogue peel iters set to vf/2 "
3881 : : "because loop iterations are unknown .\n");
3882 : 139174 : return assumed_vf / 2;
3883 : : }
3884 : : else
3885 : : {
3886 : 208526 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3887 : 208526 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
3888 : 208526 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3889 : : /* If we need to peel for gaps, but no peeling is required, we have to
3890 : : peel VF iterations. */
3891 : 208526 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3892 : 208526 : peel_iters_epilogue = assumed_vf;
3893 : 208526 : return peel_iters_epilogue;
3894 : : }
3895 : : }
3896 : :
3897 : : /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3898 : : int
3899 : 264172 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3900 : : int *peel_iters_epilogue,
3901 : : stmt_vector_for_cost *scalar_cost_vec,
3902 : : stmt_vector_for_cost *prologue_cost_vec,
3903 : : stmt_vector_for_cost *epilogue_cost_vec)
3904 : : {
3905 : 264172 : int retval = 0;
3906 : :
3907 : 264172 : *peel_iters_epilogue
3908 : 264172 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3909 : :
3910 : 264172 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3911 : : {
3912 : : /* If peeled iterations are known but number of scalar loop
3913 : : iterations are unknown, count a taken branch per peeled loop. */
3914 : 89080 : if (peel_iters_prologue > 0)
3915 : 53401 : retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3916 : : vect_prologue);
3917 : 89080 : if (*peel_iters_epilogue > 0)
3918 : 89005 : retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3919 : : vect_epilogue);
3920 : : }
3921 : :
3922 : 264172 : stmt_info_for_cost *si;
3923 : 264172 : int j;
3924 : 264172 : if (peel_iters_prologue)
3925 : 651788 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3926 : 540371 : retval += record_stmt_cost (prologue_cost_vec,
3927 : 540371 : si->count * peel_iters_prologue,
3928 : : si->kind, si->stmt_info, si->misalign,
3929 : : vect_prologue);
3930 : 264172 : if (*peel_iters_epilogue)
3931 : 1016065 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3932 : 840184 : retval += record_stmt_cost (epilogue_cost_vec,
3933 : 840184 : si->count * *peel_iters_epilogue,
3934 : : si->kind, si->stmt_info, si->misalign,
3935 : : vect_epilogue);
3936 : :
3937 : 264172 : return retval;
3938 : : }
3939 : :
3940 : : /* Function vect_estimate_min_profitable_iters
3941 : :
3942 : : Return the number of iterations required for the vector version of the
3943 : : loop to be profitable relative to the cost of the scalar version of the
3944 : : loop.
3945 : :
3946 : : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3947 : : of iterations for vectorization. -1 value means loop vectorization
3948 : : is not profitable. This returned value may be used for dynamic
3949 : : profitability check.
3950 : :
3951 : : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3952 : : for static check against estimated number of iterations. */
3953 : :
static void
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
				    int *ret_min_profitable_niters,
				    int *ret_min_profitable_estimate,
				    unsigned *suggested_unroll_factor)
{
  int min_profitable_iters;
  int min_profitable_estimate;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  unsigned vec_inside_cost = 0;
  int vec_outside_cost = 0;
  unsigned vec_prologue_cost = 0;
  unsigned vec_epilogue_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int assumed_vf = vect_vf_for_cost (loop_vinfo);
  /* NPEEL < 0 means the prologue peel count is unknown at compile time.  */
  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  vector_costs *target_cost_data = loop_vinfo->vector_costs;

  /* Cost model disabled: report zero thresholds so vectorization is
     always considered profitable.  */
  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
      *ret_min_profitable_niters = 0;
      *ret_min_profitable_estimate = 0;
      return;
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE,
		     "cost model: Adding cost of checks for loop "
		     "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
      if (len)
	/* Count LEN - 1 ANDs and LEN comparisons.  */
	(void) add_stmt_cost (target_cost_data, len * 2 - 1,
			      scalar_stmt, vect_prologue);
      len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
      if (len)
	{
	  /* Count LEN - 1 ANDs and LEN comparisons.  */
	  unsigned int nstmts = len * 2 - 1;
	  /* +1 for each bias that needs adding.  */
	  for (unsigned int i = 0; i < len; ++i)
	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
	      nstmts += 1;
	  (void) add_stmt_cost (target_cost_data, nstmts,
				scalar_stmt, vect_prologue);
	}
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE,
		     "cost model: Adding cost of checks for loop "
		     "versioning aliasing.\n");
    }

  /* Requires loop versioning with niter checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
			    NULL, NULL, NULL_TREE, 0, vect_prologue);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE,
		     "cost model: Adding cost of checks for loop "
		     "versioning niters.\n");
    }

  /* Any versioning at all implies a guarding branch on the scalar path.  */
  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			  vect_prologue);

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost = (loop_vinfo->scalar_costs->total_cost ()
			     * param_vect_scalar_cost_multiplier) / 100;

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.  (For fully-masked loops there will be no peeling.)

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  bool prologue_need_br_taken_cost = false;
  bool prologue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_prologue.  */
  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    peel_iters_prologue = 0;
  else if (npeel < 0)
    {
      peel_iters_prologue = assumed_vf / 2;
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "cost model: "
		     "prologue peel iters set to vf/2.\n");

      /* If peeled iterations are unknown, count a taken branch and a not taken
	 branch per peeled loop.  Even if scalar loop iterations are known,
	 vector iterations are not known since peeled prologue iterations are
	 not known.  Hence guards remain the same.  */
      prologue_need_br_taken_cost = true;
      prologue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_prologue = npeel;
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
	/* If peeled iterations are known but number of scalar loop
	   iterations are unknown, count a taken branch per peeled loop.  */
	prologue_need_br_taken_cost = true;
    }

  bool epilogue_need_br_taken_cost = false;
  bool epilogue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_epilogue.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    /* We need to peel exactly one iteration for gaps.  */
    peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  else if (npeel < 0)
    {
      /* If peeling for alignment is unknown, loop bound of main loop
	 becomes unknown.  */
      peel_iters_epilogue = assumed_vf / 2;
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "cost model: "
		     "epilogue peel iters set to vf/2 because "
		     "peeling for alignment is unknown.\n");

      /* See the same reason above in peel_iters_prologue calculation.  */
      epilogue_need_br_taken_cost = true;
      epilogue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
	/* If peeled iterations are known but number of scalar loop
	   iterations are unknown, count a taken branch per peeled loop.  */
	epilogue_need_br_taken_cost = true;
    }

  stmt_info_for_cost *si;
  int j;
  /* Add costs associated with peel_iters_prologue.  */
  if (peel_iters_prologue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
	(void) add_stmt_cost (target_cost_data,
			      si->count * peel_iters_prologue, si->kind,
			      si->stmt_info, si->node, si->vectype,
			      si->misalign, vect_prologue);
      }

  /* Add costs associated with peel_iters_epilogue.  */
  if (peel_iters_epilogue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
	(void) add_stmt_cost (target_cost_data,
			      si->count * peel_iters_epilogue, si->kind,
			      si->stmt_info, si->node, si->vectype,
			      si->misalign, vect_epilogue);
      }

  /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */

  if (prologue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			  vect_prologue);

  if (prologue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
			  cond_branch_not_taken, vect_prologue);

  if (epilogue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			  vect_epilogue);

  if (epilogue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
			  cond_branch_not_taken, vect_epilogue);

  /* Take care of special costs for rgroup controls of partial vectors.  */
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	  == vect_partial_vectors_avx512))
    {
      /* Calculate how many masks we need to generate.  */
      unsigned int num_masks = 0;
      bool need_saturation = false;
      for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
	if (rgm.type)
	  {
	    unsigned nvectors = rgm.factor;
	    num_masks += nvectors;
	    /* Saturation is needed when the compare (IV) type is narrower
	       than the rgroup IV type.  */
	    if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
		< TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
	      need_saturation = true;
	  }

      /* ??? The target isn't able to identify the costs below as
	 producing masks so it cannot penaltize cases where we'd run
	 out of mask registers for example.  */

      /* ??? We are also failing to account for smaller vector masks
	 we generate by splitting larger masks in vect_get_loop_mask.  */

      /* In the worst case, we need to generate each mask in the prologue
	 and in the loop body.  We need one splat per group and one
	 compare per mask.

	 Sometimes the prologue mask will fold to a constant,
	 so the actual prologue cost might be smaller.  However, it's
	 simpler and safer to use the worst-case cost; if this ends up
	 being the tie-breaker between vectorizing or not, then it's
	 probably better not to vectorize.  */
      (void) add_stmt_cost (target_cost_data,
			    num_masks
			    + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
			    vector_stmt, NULL, NULL, NULL_TREE, 0,
			    vect_prologue);
      (void) add_stmt_cost (target_cost_data,
			    num_masks
			    + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
			    vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);

      /* When we need saturation we need it both in the prologue and
	 the epilogue.  */
      if (need_saturation)
	{
	  (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
				NULL, NULL, NULL_TREE, 0, vect_prologue);
	  (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
				NULL, NULL, NULL_TREE, 0, vect_body);
	}
    }
  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
	   && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	       == vect_partial_vectors_while_ult))
    {
      /* Calculate how many masks we need to generate.  */
      unsigned int num_masks = 0;
      rgroup_controls *rgm;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
			num_vectors_m1, rgm)
	if (rgm->type)
	  num_masks += num_vectors_m1 + 1;
      gcc_assert (num_masks > 0);

      /* In the worst case, we need to generate each mask in the prologue
	 and in the loop body.  One of the loop body mask instructions
	 replaces the comparison in the scalar loop, and since we don't
	 count the scalar comparison against the scalar body, we shouldn't
	 count that vector instruction against the vector body either.

	 Sometimes we can use unpacks instead of generating prologue
	 masks and sometimes the prologue mask will fold to a constant,
	 so the actual prologue cost might be smaller.  However, it's
	 simpler and safer to use the worst-case cost; if this ends up
	 being the tie-breaker between vectorizing or not, then it's
	 probably better not to vectorize.  */
      (void) add_stmt_cost (target_cost_data, num_masks,
			    vector_stmt, NULL, NULL, NULL_TREE, 0,
			    vect_prologue);
      (void) add_stmt_cost (target_cost_data, num_masks - 1,
			    vector_stmt, NULL, NULL, NULL_TREE, 0,
			    vect_body);
    }
  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    {
      /* Referring to the functions vect_set_loop_condition_partial_vectors
	 and vect_set_loop_controls_directly, we need to generate each
	 length in the prologue and in the loop body if required.  Although
	 there are some possible optimizations, we consider the worst case
	 here.  */

      bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
      signed char partial_load_store_bias
	= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
      bool need_iterate_p
	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
	   && !vect_known_niters_smaller_than_vf (loop_vinfo));

      /* Calculate how many statements to be added.  */
      unsigned int prologue_stmts = 0;
      unsigned int body_stmts = 0;

      rgroup_controls *rgc;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
	if (rgc->type)
	  {
	    /* May need one SHIFT for nitems_total computation.  */
	    unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
	    if (nitems != 1 && !niters_known_p)
	      prologue_stmts += 1;

	    /* May need one MAX and one MINUS for wrap around.  */
	    if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
	      prologue_stmts += 2;

	    /* Need one MAX and one MINUS for each batch limit excepting for
	       the 1st one.  */
	    prologue_stmts += num_vectors_m1 * 2;

	    unsigned int num_vectors = num_vectors_m1 + 1;

	    /* Need to set up lengths in prologue, only one MIN required
	       for each since start index is zero.  */
	    prologue_stmts += num_vectors;

	    /* If we have a non-zero partial load bias, we need one PLUS
	       to adjust the load length.  */
	    if (partial_load_store_bias != 0)
	      body_stmts += 1;

	    unsigned int length_update_cost = 0;
	    if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
	      /* For decrement IV style, Each only need a single SELECT_VL
		 or MIN since beginning to calculate the number of elements
		 need to be processed in current iteration.  */
	      length_update_cost = 1;
	    else
	      /* For increment IV stype, Each may need two MINs and one MINUS to
		 update lengths in body for next iteration.  */
	      length_update_cost = 3;

	    if (need_iterate_p)
	      body_stmts += length_update_cost * num_vectors;
	  }

      (void) add_stmt_cost (target_cost_data, prologue_stmts,
			    scalar_stmt, vect_prologue);
      (void) add_stmt_cost (target_cost_data, body_stmts,
			    scalar_stmt, vect_body);
    }

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
	 jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
	 prologue = scalar_iters
       if (prologue == 0)
	 jmp to vector code
       else
	 execute prologue
       if (prologue == num_iters)
	 go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
	 jmp to vector code
       else
	 execute prologue
       if (prologue == num_iters)
	 go to exit
       vector code:
	 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
	   jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     do not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      /* Cost model check occurs at versioning.  */
      if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
      else
	{
	  /* Cost model check occurs at prologue generation.  */
	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
	      + vect_get_stmt_cost (cond_branch_not_taken);
	  /* Cost model check occurs at epilogue generation.  */
	  else
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
	}
    }

  /* Complete the target-specific cost calculations.  */
  loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
  vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
  vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
  vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
  if (suggested_unroll_factor)
    *suggested_unroll_factor
      = loop_vinfo->vector_costs->suggested_unroll_factor ();

  /* Reject a target-suggested unroll factor that would push the unrolled
     vectorization factor past the maximum supported one.  */
  if (suggested_unroll_factor && *suggested_unroll_factor > 1
      && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
      && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
		    *suggested_unroll_factor,
		    LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't unroll as unrolled vectorization factor larger"
			 " than maximum vectorization factor: "
			 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
			 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
      *suggested_unroll_factor = 1;
    }

  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
		   vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
		   vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
		   scalar_single_iter_cost);
      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
		   scalar_outside_cost);
      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
		   vec_outside_cost);
      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
		   peel_iters_prologue);
      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
		   peel_iters_epilogue);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     NPEEL = prologue iterations + epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */

  /* Per-vector-iteration saving over the scalar code; non-positive means
     the vector body never wins, regardless of the trip count.  */
  int saving_per_viter = (scalar_single_iter_cost * assumed_vf
			  - vec_inside_cost);
  if (saving_per_viter <= 0)
    {
      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
	warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
		    "vectorization did not happen for a simd loop");

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cost model: the vector iteration cost = %d "
			 "divided by the scalar iteration cost = %d "
			 "is greater or equal to the vectorization factor = %d"
			 ".\n",
			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
      *ret_min_profitable_niters = -1;
      *ret_min_profitable_estimate = -1;
      return;
    }

  /* ??? The "if" arm is written to handle all cases; see below for what
     we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* Rewriting the condition above in terms of the number of
	 vector iterations (vniters) rather than the number of
	 scalar iterations (niters) gives:

	 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC

	 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC

	 For integer N, X and Y when X > 0:

	 N * X > Y <==> N >= (Y /[floor] X) + 1.  */
      int outside_overhead = (vec_outside_cost
			      - scalar_single_iter_cost * peel_iters_prologue
			      - scalar_single_iter_cost * peel_iters_epilogue
			      - scalar_outside_cost);
      /* We're only interested in cases that require at least one
	 vector iteration.  */
      int min_vec_niters = 1;
      if (outside_overhead > 0)
	min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
		     min_vec_niters);

      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	{
	  /* Now that we know the minimum number of vector iterations,
	     find the minimum niters for which the scalar cost is larger:

	     SIC * niters > VIC * vniters + VOC - SOC

	     We know that the minimum niters is no more than
	     vniters * VF + NPEEL, but it might be (and often is) less
	     than that if a partial vector iteration is cheaper than the
	     equivalent scalar code.  */
	  int threshold = (vec_inside_cost * min_vec_niters
			   + vec_outside_cost
			   - scalar_outside_cost);
	  if (threshold <= 0)
	    min_profitable_iters = 1;
	  else
	    min_profitable_iters = threshold / scalar_single_iter_cost + 1;
	}
      else
	/* Convert the number of vector iterations into a number of
	   scalar iterations.  */
	min_profitable_iters = (min_vec_niters * assumed_vf
				+ peel_iters_prologue
				+ peel_iters_epilogue);
    }
  else
    {
      min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
			      * assumed_vf
			      - vec_inside_cost * peel_iters_prologue
			      - vec_inside_cost * peel_iters_epilogue);
      if (min_profitable_iters <= 0)
	min_profitable_iters = 0;
      else
	{
	  min_profitable_iters /= saving_per_viter;

	  /* Round up: if at the computed count the scalar cost still does
	     not strictly exceed the vector cost, one more iteration is
	     needed.  */
	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
	      <= (((int) vec_inside_cost * min_profitable_iters)
		  + (((int) vec_outside_cost - scalar_outside_cost)
		     * assumed_vf)))
	    min_profitable_iters++;
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "  Calculated minimum iters for profitability: %d\n",
		 min_profitable_iters);

  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
    /* We want the vectorized loop to execute at least once.  */
    min_profitable_iters = assumed_vf + peel_iters_prologue;
  else if (min_profitable_iters < peel_iters_prologue)
    /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
       vectorized loop executes at least once.  */
    min_profitable_iters = peel_iters_prologue;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Runtime profitability threshold = %d\n",
		     min_profitable_iters);

  *ret_min_profitable_niters = min_profitable_iters;

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.

     Non-vectorized variant is SIC * niters and it must win over vector
     variant on the expected loop trip count.  The following condition must hold true:
     SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */

  if (vec_outside_cost <= 0)
    min_profitable_estimate = 0;
  /* ??? This "else if" arm is written to handle all cases; see below for
     what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* This is a repeat of the code above, but with + SOC rather
	 than - SOC.  */
      int outside_overhead = (vec_outside_cost
			      - scalar_single_iter_cost * peel_iters_prologue
			      - scalar_single_iter_cost * peel_iters_epilogue
			      + scalar_outside_cost);
      int min_vec_niters = 1;
      if (outside_overhead > 0)
	min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	{
	  int threshold = (vec_inside_cost * min_vec_niters
			   + vec_outside_cost
			   + scalar_outside_cost);
	  min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
	}
      else
	min_profitable_estimate = (min_vec_niters * assumed_vf
				   + peel_iters_prologue
				   + peel_iters_epilogue);
    }
  else
    {
      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
				 * assumed_vf
				 - vec_inside_cost * peel_iters_prologue
				 - vec_inside_cost * peel_iters_epilogue)
				 / ((scalar_single_iter_cost * assumed_vf)
				    - vec_inside_cost);
    }
  /* The static estimate can never be smaller than the runtime threshold.  */
  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Static estimate profitability threshold = %d\n",
		     min_profitable_estimate);

  *ret_min_profitable_estimate = min_profitable_estimate;
}
4612 : :
4613 : : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4614 : : vector elements (not bits) for a vector with NELT elements. */
4615 : : static void
4616 : 2147 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4617 : : vec_perm_builder *sel)
4618 : : {
4619 : : /* The encoding is a single stepped pattern. Any wrap-around is handled
4620 : : by vec_perm_indices. */
4621 : 2147 : sel->new_vector (nelt, 1, 3);
4622 : 8588 : for (unsigned int i = 0; i < 3; i++)
4623 : 6441 : sel->quick_push (i + offset);
4624 : 2147 : }
4625 : :
4626 : : /* Checks whether the target supports whole-vector shifts for vectors of mode
4627 : : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4628 : : it supports vec_perm_const with masks for all necessary shift amounts. */
4629 : : static bool
4630 : 7518 : have_whole_vector_shift (machine_mode mode)
4631 : : {
4632 : 7518 : if (can_implement_p (vec_shr_optab, mode))
4633 : : return true;
4634 : :
4635 : : /* Variable-length vectors should be handled via the optab. */
4636 : 61 : unsigned int nelt;
4637 : 122 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4638 : : return false;
4639 : :
4640 : 61 : vec_perm_builder sel;
4641 : 61 : vec_perm_indices indices;
4642 : 307 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4643 : : {
4644 : 246 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
4645 : 246 : indices.new_vector (sel, 2, nelt);
4646 : 246 : if (!can_vec_perm_const_p (mode, mode, indices, false))
4647 : : return false;
4648 : : }
4649 : : return true;
4650 : 61 : }
4651 : :
4652 : : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4653 : : multiplication operands have differing signs and (b) we intend
4654 : : to emulate the operation using a series of signed DOT_PROD_EXPRs.
4655 : : See vect_emulate_mixed_dot_prod for the actual sequence used. */
4656 : :
4657 : : static bool
4658 : 2179 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
4659 : : {
4660 : 2179 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
4661 : 2179 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4662 : 1726 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4663 : : return false;
4664 : :
4665 : 589 : tree rhs1 = gimple_assign_rhs1 (assign);
4666 : 589 : tree rhs2 = gimple_assign_rhs2 (assign);
4667 : 589 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4668 : : return false;
4669 : :
4670 : 435 : return !directly_supported_p (DOT_PROD_EXPR,
4671 : : SLP_TREE_VECTYPE (slp_node),
4672 : 145 : SLP_TREE_VECTYPE
4673 : : (SLP_TREE_CHILDREN (slp_node)[0]),
4674 : 145 : optab_vector_mixed_sign);
4675 : : }
4676 : :
4677 : : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4678 : : functions. Design better to avoid maintenance issues. */
4679 : :
/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop in some cases, the initial
   definition before the loop, and the epilogue code that must be generated.

   LOOP_VINFO describes the loop being vectorized.  NODE is the SLP node
   for the reduction, REDUC_FN the internal function implementing the
   epilogue reduction (IFN_LAST if there is none), REDUCTION_TYPE the kind
   of reduction, NCOPIES the number of vector copies and COST_VEC the cost
   vector the costs are added to.  */

static void
vect_model_reduction_cost (loop_vec_info loop_vinfo,
			   slp_tree node, internal_fn reduc_fn,
			   vect_reduction_type reduction_type,
			   int ncopies, stmt_vector_for_cost *cost_vec)
{
  int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
  tree vectype;
  machine_mode mode;
  class loop *loop = NULL;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Condition reductions generate two reductions in the loop
     (one for the values, one for the index), so double NCOPIES.  */
  if (reduction_type == COND_REDUCTION)
    ncopies *= 2;

  vectype = SLP_TREE_VECTYPE (node);
  mode = TYPE_MODE (vectype);
  /* Cost the original (pre-pattern) statement.  */
  stmt_vec_info orig_stmt_info
    = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));

  gimple_match_op op;
  if (!gimple_extract_op (orig_stmt_info->stmt, &op))
    gcc_unreachable ();

  if (reduction_type == EXTRACT_LAST_REDUCTION)
    /* No extra instructions are needed in the prologue.  The loop body
       operations are costed in vectorizable_condition.  */
    inside_cost = 0;
  else if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      /* No extra instructions needed in the prologue.  */
      prologue_cost = 0;

      if (reduc_fn != IFN_LAST)
	/* Count one reduction-like operation per vector.  */
	inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
					node, 0, vect_body);
      else
	{
	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
	  inside_cost = record_stmt_cost (cost_vec, nelements,
					  vec_to_scalar, node, 0,
					  vect_body);
	  inside_cost += record_stmt_cost (cost_vec, nelements,
					   scalar_stmt, node, 0,
					   vect_body);
	}
    }
  else
    {
      /* Add in the cost of the initial definitions.  */
      int prologue_stmts;
      if (reduction_type == COND_REDUCTION)
	/* For cond reductions we have four vectors: initial index, step,
	   initial result of the data reduction, initial value of the index
	   reduction.  */
	prologue_stmts = 4;
      else
	/* We need the initial reduction value.  */
	prologue_stmts = 1;
      prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
					 scalar_to_vec, node, 0,
					 vect_prologue);
    }

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  /* Nested (inner-loop) reductions need no epilogue code at all.  */
  if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
    {
      if (reduc_fn != IFN_LAST)
	{
	  if (reduction_type == COND_REDUCTION)
	    {
	      /* An EQ stmt and an COND_EXPR stmt.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 2,
						 vector_stmt, node, 0,
						 vect_epilogue);
	      /* Reduction of the max index and a reduction of the found
		 values.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 2,
						 vec_to_scalar, node, 0,
						 vect_epilogue);
	      /* A broadcast of the max value.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 scalar_to_vec, node, 0,
						 vect_epilogue);
	    }
	  else
	    {
	      /* The direct reduction plus the final scalar extract.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
						 node, 0, vect_epilogue);
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 vec_to_scalar, node, 0,
						 vect_epilogue);
	    }
	}
      else if (reduction_type == COND_REDUCTION)
	{
	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
	  /* Extraction of scalar elements.  */
	  epilogue_cost += record_stmt_cost (cost_vec,
					     2 * estimated_nunits,
					     vec_to_scalar, node, 0,
					     vect_epilogue);
	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
	  epilogue_cost += record_stmt_cost (cost_vec,
					     2 * estimated_nunits - 3,
					     scalar_stmt, node, 0,
					     vect_epilogue);
	}
      else if (reduction_type == EXTRACT_LAST_REDUCTION
	       || reduction_type == FOLD_LEFT_REDUCTION)
	/* No extra instructions needed in the epilogue.  */
	;
      else
	{
	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
	  tree bitsize = TYPE_SIZE (op.type);
	  int element_bitsize = tree_to_uhwi (bitsize);
	  int nelements = vec_size_in_bits / element_bitsize;

	  /* A COND_EXPR reduction's epilogue is costed like MAX_EXPR.  */
	  if (op.code == COND_EXPR)
	    op.code = MAX_EXPR;

	  /* We have a whole vector shift available.  */
	  if (VECTOR_MODE_P (mode)
	      && directly_supported_p (op.code, vectype)
	      && have_whole_vector_shift (mode))
	    {
	      /* Final reduction via vector shifts and the reduction operator.
		 Also requires scalar extract.  */
	      epilogue_cost += record_stmt_cost (cost_vec,
						 exact_log2 (nelements) * 2,
						 vector_stmt, node, 0,
						 vect_epilogue);
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 vec_to_scalar, node, 0,
						 vect_epilogue);
	    }
	  else
	    /* Use extracts and reduction op for final reduction.  For N
	       elements, we have N extracts and N-1 reduction ops.  */
	    epilogue_cost += record_stmt_cost (cost_vec,
					       nelements + nelements - 1,
					       vector_stmt, node, 0,
					       vect_epilogue);
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "vect_model_reduction_cost: inside_cost = %d, "
		 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
		 prologue_cost, epilogue_cost);
}
4848 : :
4849 : : /* SEQ is a sequence of instructions that initialize the reduction
4850 : : described by REDUC_INFO. Emit them in the appropriate place. */
4851 : :
4852 : : static void
4853 : 442 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4854 : : vect_reduc_info reduc_info, gimple *seq)
4855 : : {
4856 : 442 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
4857 : : {
4858 : : /* When reusing an accumulator from the main loop, we only need
4859 : : initialization instructions if the main loop can be skipped.
4860 : : In that case, emit the initialization instructions at the end
4861 : : of the guard block that does the skip. */
4862 : 25 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
4863 : 25 : gcc_assert (skip_edge);
4864 : 25 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4865 : 25 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4866 : : }
4867 : : else
4868 : : {
4869 : : /* The normal case: emit the initialization instructions on the
4870 : : preheader edge. */
4871 : 417 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4872 : 417 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4873 : : }
4874 : 442 : }
4875 : :
/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
   which performs a reduction involving GROUP_SIZE scalar statements.
   NUMBER_OF_VECTORS is the number of vector defs to create, pushed onto
   VEC_OPRNDS; they have type VECTOR_TYPE.  If NEUTRAL_OP is nonnull,
   introducing extra elements of that value will not change the result.  */

static void
get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
				vect_reduc_info reduc_info,
				tree vector_type,
				vec<tree> *vec_oprnds,
				unsigned int number_of_vectors,
				unsigned int group_size, tree neutral_op)
{
  vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  unsigned int i;

  /* Without a neutral element, every lane needs its own initial value.  */
  gcc_assert (group_size == initial_values.length () || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several
     vectors containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  tree vector_elt_type = TREE_TYPE (vector_type);
  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  /* Conversion statements needed for the initial values accumulate in
     CTOR_SEQ and are emitted once at the end.  */
  gimple_seq ctor_seq = NULL;
  if (neutral_op
      && !useless_type_conversion_p (vector_elt_type,
				     TREE_TYPE (neutral_op)))
    neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
  for (j = 0; j < nunits * number_of_vectors; ++j)
    {
      tree op;
      i = j % group_size;

      /* Get the def before the loop.  In reduction chain we have only
	 one initial value.  Else we have as many as PHIs in the group.  */
      if (i >= initial_values.length () || (j > i && neutral_op))
	op = neutral_op;
      else
	{
	  /* Coerce the initial value to the vector element type; boolean
	     values become 0 / -1 via a COND_EXPR.  */
	  if (!useless_type_conversion_p (vector_elt_type,
					  TREE_TYPE (initial_values[i])))
	    {
	      if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
						  vector_elt_type,
						  initial_values[i],
						  build_all_ones_cst
						    (vector_elt_type),
						  build_zero_cst
						    (vector_elt_type));
	      else
		initial_values[i] = gimple_convert (&ctor_seq,
						    vector_elt_type,
						    initial_values[i]);
	    }
	  op = initial_values[i];
	}

      /* Create 'vect_ = {op0,op1,...,opn}'.  */
      number_of_places_left_in_vector--;
      elts[nunits - number_of_places_left_in_vector - 1] = op;
      if (!CONSTANT_CLASS_P (op))
	constant_p = false;

      /* Once a full vector's worth of elements is collected, build it.  */
      if (number_of_places_left_in_vector == 0)
	{
	  tree init;
	  if (constant_p && !neutral_op
	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
	    /* Build the vector directly from ELTS.  */
	    init = gimple_build_vector (&ctor_seq, &elts);
	  else if (neutral_op)
	    {
	      /* Build a vector of the neutral value and shift the
		 other elements into place.  */
	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
						   neutral_op);
	      int k = nunits;
	      /* Trailing neutral elements are already in place.  */
	      while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
		k -= 1;
	      while (k > 0)
		{
		  k -= 1;
		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
				       vector_type, init, elts[k]);
		}
	    }
	  else
	    {
	      /* First time round, duplicate ELTS to fill the
		 required number of vectors.  */
	      duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
					elts, number_of_vectors, *vec_oprnds);
	      break;
	    }
	  vec_oprnds->quick_push (init);

	  /* Reset the builder state for the next vector.  */
	  number_of_places_left_in_vector = nunits;
	  elts.new_vector (vector_type, nunits, 1);
	  elts.quick_grow (nunits);
	  constant_p = true;
	}
    }
  if (ctor_seq != NULL)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
}
5005 : :
5006 : : vect_reduc_info
5007 : 131314 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
5008 : : {
5009 : 131314 : if (node->cycle_info.id == -1)
5010 : : return NULL;
5011 : 129524 : return loop_vinfo->reduc_infos[node->cycle_info.id];
5012 : : }
5013 : :
/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
   REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
   return false.  VECTYPE is the vector type the epilogue loop uses for the
   reduction.  */

static bool
vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
				vect_reduc_info reduc_info, tree vectype)
{
  /* Only an epilogue loop has a main loop to inherit from.  */
  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  if (!main_loop_vinfo)
    return false;

  /* Only plain tree-code reductions can carry an accumulator over.  */
  if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
    return false;

  /* We are not set up to handle vector bools when they are not mapped
     to vector integer data types.  */
  if (VECTOR_BOOLEAN_TYPE_P (vectype)
      && GET_MODE_CLASS (TYPE_MODE (vectype)) != MODE_VECTOR_INT)
    return false;

  unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
  auto_vec<tree, 16> main_loop_results (num_phis);
  auto_vec<tree, 16> initial_values (num_phis);
  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
    {
      /* The epilogue loop can be entered either from the main loop or
	 from an earlier guard block.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
	{
	  /* Look for:

	       INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
				    INITIAL_VALUE(guard block)>.  */
	  gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);

	  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
	  gcc_assert (gimple_bb (phi) == main_loop_edge->dest);

	  tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
	  tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);

	  main_loop_results.quick_push (from_main_loop);
	  initial_values.quick_push (from_skip);
	}
    }
  else
    /* The main loop dominates the epilogue loop.  */
    main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));

  /* See if the main loop has the kind of accumulator we need.  The scalar
     results of the main-loop reduction must match our incoming values.  */
  vect_reusable_accumulator *accumulator
    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
  if (!accumulator
      || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
		      VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
    return false;

  /* Handle the case where we can reduce wider vectors to narrower ones.  */
  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
  unsigned HOST_WIDE_INT m;
  if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
			    TYPE_VECTOR_SUBPARTS (vectype), &m))
    return false;
  /* Check the intermediate vector types and operations are available:
     each halving step needs both the reduction operation and a
     vector-extract at the intermediate width.  */
  tree prev_vectype = old_vectype;
  poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
  while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      intermediate_nunits = exact_div (intermediate_nunits, 2);
      tree intermediate_vectype = get_related_vectype_for_scalar_type
	(TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
      if (!intermediate_vectype
	  || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
				    intermediate_vectype)
	  || !can_vec_extract (TYPE_MODE (prev_vectype),
			       TYPE_MODE (intermediate_vectype)))
	return false;
      prev_vectype = intermediate_vectype;
    }

  /* Non-SLP reductions might apply an adjustment after the reduction
     operation, in order to simplify the initialization of the accumulator.
     If the epilogue loop carries on from where the main loop left off,
     it should apply the same adjustment to the final reduction result.

     If the epilogue loop can also be entered directly (rather than via
     the main loop), we need to be able to handle that case in the same way,
     with the same adjustment.  (In principle we could add a PHI node
     to select the correct adjustment, but in practice that shouldn't be
     necessary.)  */
  tree main_adjustment
    = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
  if (loop_vinfo->main_loop_edge && main_adjustment)
    {
      gcc_assert (num_phis == 1);
      tree initial_value = initial_values[0];
      /* Check that we can use INITIAL_VALUE as the adjustment and
	 initialize the accumulator with a neutral value instead.  */
      if (!operand_equal_p (initial_value, main_adjustment))
	return false;
      code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
						    code, initial_value);
    }
  /* Commit: record the adjustment, the (possibly neutralized) initial
     values and the accumulator itself on REDUC_INFO.  */
  VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
  VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
  VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
  VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
  return true;
}
5127 : :
/* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
   CODE, emitting stmts to SEQ.  Returns a vector def of VECTYPE.  */

static tree
vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
			    gimple_seq *seq)
{
  /* Boolean vectors are only handled when mapped to integer vector modes.  */
  gcc_assert (!VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (vec_def))
	      || (GET_MODE_CLASS (TYPE_MODE (TREE_TYPE (vec_def)))
		  == MODE_VECTOR_INT))
  ;
  unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
  unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
  tree stype = TREE_TYPE (vectype);
  tree new_temp = vec_def;
  /* Repeatedly split the working vector in half and combine the halves
     with CODE until the element count reaches VECTYPE's.  */
  while (nunits > nunits1)
    {
      nunits /= 2;
      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
							   stype, nunits);
      unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));

      /* The target has to make sure we support lowpart/highpart
	 extraction, either via direct vector extract or through
	 an integer mode punning.  */
      tree dst1, dst2;
      gimple *epilog_stmt;
      if (convert_optab_handler (vec_extract_optab,
				 TYPE_MODE (TREE_TYPE (new_temp)),
				 TYPE_MODE (vectype1))
	  != CODE_FOR_nothing)
	{
	  /* Extract sub-vectors directly once vec_extract becomes
	     a conversion optab.  DST1 is the low half, DST2 the high.  */
	  dst1 = make_ssa_name (vectype1);
	  epilog_stmt
	    = gimple_build_assign (dst1, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, vectype1,
					   new_temp, TYPE_SIZE (vectype1),
					   bitsize_int (0)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst2 = make_ssa_name (vectype1);
	  epilog_stmt
	    = gimple_build_assign (dst2, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, vectype1,
					   new_temp, TYPE_SIZE (vectype1),
					   bitsize_int (bitsize)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	}
      else
	{
	  /* Extract via punning to appropriately sized integer mode
	     vector.  */
	  tree eltype = build_nonstandard_integer_type (bitsize, 1);
	  tree etype = build_vector_type (eltype, 2);
	  gcc_assert (convert_optab_handler (vec_extract_optab,
					     TYPE_MODE (etype),
					     TYPE_MODE (eltype))
		      != CODE_FOR_nothing);
	  /* View the input as a two-element integer vector...  */
	  tree tem = make_ssa_name (etype);
	  epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     etype, new_temp));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  new_temp = tem;
	  /* ...extract its low element and pun that back to VECTYPE1...  */
	  tem = make_ssa_name (eltype);
	  epilog_stmt
	    = gimple_build_assign (tem, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, eltype,
					   new_temp, TYPE_SIZE (eltype),
					   bitsize_int (0)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst1 = make_ssa_name (vectype1);
	  epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     vectype1, tem));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  /* ...and likewise for the high element.  */
	  tem = make_ssa_name (eltype);
	  epilog_stmt
	    = gimple_build_assign (tem, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, eltype,
					   new_temp, TYPE_SIZE (eltype),
					   bitsize_int (bitsize)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst2 = make_ssa_name (vectype1);
	  epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     vectype1, tem));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	}

      /* Combine the two halves with the reduction operation.  */
      new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
    }
  /* Finally pun the result to VECTYPE if the types still differ.  */
  if (!useless_type_conversion_p (vectype, TREE_TYPE (new_temp)))
    {
      tree dst3 = make_ssa_name (vectype);
      gimple *epilog_stmt = gimple_build_assign (dst3, VIEW_CONVERT_EXPR,
						 build1 (VIEW_CONVERT_EXPR,
							 vectype, new_temp));
      gimple_seq_add_stmt_without_update (seq, epilog_stmt);
      new_temp = dst3;
    }

  return new_temp;
}
5232 : :
5233 : : /* Function vect_create_epilog_for_reduction
5234 : :
5235 : : Create code at the loop-epilog to finalize the result of a reduction
5236 : : computation.
5237 : :
5238 : : STMT_INFO is the scalar reduction stmt that is being vectorized.
5239 : : SLP_NODE is an SLP node containing a group of reduction statements. The
5240 : : first one in this group is STMT_INFO.
5241 : : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5242 : : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5243 : : (counting from 0)
5244 : : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5245 : : exit this edge is always the main loop exit.
5246 : :
5247 : : This function:
5248 : : 1. Completes the reduction def-use cycles.
5249 : : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5250 : : by calling the function specified by REDUC_FN if available, or by
5251 : : other means (whole-vector shifts or a scalar loop).
5252 : : The function also creates a new phi node at the loop exit to preserve
5253 : : loop-closed form, as illustrated below.
5254 : :
5255 : : The flow at the entry to this function:
5256 : :
5257 : : loop:
5258 : : vec_def = phi <vec_init, null> # REDUCTION_PHI
5259 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5260 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5261 : : loop_exit:
5262 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5263 : : use <s_out0>
5264 : : use <s_out0>
5265 : :
5266 : : The above is transformed by this function into:
5267 : :
5268 : : loop:
5269 : : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5270 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5271 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5272 : : loop_exit:
5273 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5274 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5275 : : v_out2 = reduce <v_out1>
5276 : : s_out3 = extract_field <v_out2, 0>
5277 : : s_out4 = adjust_result <s_out3>
5278 : : use <s_out4>
5279 : : use <s_out4>
5280 : : */
5281 : :
5282 : : static void
5283 : 21922 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5284 : : stmt_vec_info stmt_info,
5285 : : slp_tree slp_node,
5286 : : slp_instance slp_node_instance,
5287 : : edge loop_exit)
5288 : : {
5289 : 21922 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
5290 : 21922 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5291 : 21922 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
5292 : 21922 : tree vectype;
5293 : 21922 : machine_mode mode;
5294 : 21922 : basic_block exit_bb;
5295 : 21922 : gimple *new_phi = NULL, *phi = NULL;
5296 : 21922 : gimple_stmt_iterator exit_gsi;
5297 : 21922 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5298 : 21922 : gimple *epilog_stmt = NULL;
5299 : 21922 : gimple *exit_phi;
5300 : 21922 : tree def;
5301 : 21922 : tree orig_name, scalar_result;
5302 : 21922 : imm_use_iterator imm_iter;
5303 : 21922 : use_operand_p use_p;
5304 : 21922 : gimple *use_stmt;
5305 : 21922 : auto_vec<tree> reduc_inputs;
5306 : 21922 : int j, i;
5307 : 21922 : vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
5308 : 21922 : unsigned int k;
5309 : : /* SLP reduction without reduction chain, e.g.,
5310 : : # a1 = phi <a2, a0>
5311 : : # b1 = phi <b2, b0>
5312 : : a2 = operation (a1)
5313 : : b2 = operation (b1) */
5314 : 21922 : const bool slp_reduc = !reduc_info->is_reduc_chain;
5315 : 21922 : tree induction_index = NULL_TREE;
5316 : :
5317 : 21922 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5318 : :
5319 : 21922 : bool double_reduc = false;
5320 : 21922 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5321 : 21922 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5322 : : {
5323 : 0 : double_reduc = true;
5324 : 0 : gcc_assert (slp_reduc);
5325 : : }
5326 : :
5327 : 21922 : vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
5328 : 21922 : gcc_assert (vectype);
5329 : 21922 : mode = TYPE_MODE (vectype);
5330 : :
5331 : 21922 : tree induc_val = NULL_TREE;
5332 : 21922 : tree adjustment_def = NULL;
5333 : : /* Optimize: for induction condition reduction, if we can't use zero
5334 : : for induc_val, use initial_def. */
5335 : 21922 : if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5336 : 66 : induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
5337 : 21856 : else if (double_reduc)
5338 : : ;
5339 : : else
5340 : 21856 : adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
5341 : :
5342 : 21922 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5343 : 21922 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5344 : 21922 : if (slp_reduc)
5345 : : /* All statements produce live-out values. */
5346 : 43454 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5347 : :
5348 : 21922 : unsigned vec_num
5349 : 21922 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5350 : :
5351 : : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5352 : : which is updated with the current index of the loop for every match of
5353 : : the original loop's cond_expr (VEC_STMT). This results in a vector
5354 : : containing the last time the condition passed for that vector lane.
5355 : : The first match will be a 1 to allow 0 to be used for non-matching
5356 : : indexes. If there are no matches at all then the vector will be all
5357 : : zeroes.
5358 : :
5359 : : PR92772: This algorithm is broken for architectures that support
5360 : : masked vectors, but do not provide fold_extract_last. */
5361 : 21922 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
5362 : : {
5363 : 67 : gcc_assert (!double_reduc);
5364 : 67 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5365 : 67 : slp_tree cond_node = slp_node_instance->root;
5366 : 143 : while (cond_node != slp_node_instance->reduc_phis)
5367 : : {
5368 : 76 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5369 : 76 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5370 : : {
5371 : 76 : gimple *vec_stmt
5372 : 76 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5373 : 76 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5374 : 76 : ccompares.safe_push
5375 : 76 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5376 : 76 : SLP_TREE_REDUC_IDX (cond_node) == 2));
5377 : : }
5378 : 76 : int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
5379 : 76 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5380 : : }
5381 : 67 : gcc_assert (ccompares.length () != 0);
5382 : :
5383 : 67 : tree indx_before_incr, indx_after_incr;
5384 : 67 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5385 : 67 : int scalar_precision
5386 : 67 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5387 : 67 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5388 : 67 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
5389 : 67 : (TYPE_MODE (vectype), cr_index_scalar_type,
5390 : : TYPE_VECTOR_SUBPARTS (vectype));
5391 : :
5392 : : /* First we create a simple vector induction variable which starts
5393 : : with the values {1,2,3,...} (SERIES_VECT) and increments by the
5394 : : vector size (STEP). */
5395 : :
5396 : : /* Create a {1,2,3,...} vector. */
5397 : 67 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5398 : :
5399 : : /* Create a vector of the step value. */
5400 : 67 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5401 : 67 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5402 : :
5403 : : /* Create an induction variable. */
5404 : 67 : gimple_stmt_iterator incr_gsi;
5405 : 67 : bool insert_after;
5406 : 67 : vect_iv_increment_position (LOOP_VINFO_IV_EXIT (loop_vinfo),
5407 : : &incr_gsi, &insert_after);
5408 : 67 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5409 : : insert_after, &indx_before_incr, &indx_after_incr);
5410 : :
5411 : : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5412 : : filled with zeros (VEC_ZERO). */
5413 : :
5414 : : /* Create a vector of 0s. */
5415 : 67 : tree zero = build_zero_cst (cr_index_scalar_type);
5416 : 67 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5417 : :
5418 : : /* Create a vector phi node. */
5419 : 67 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5420 : 67 : new_phi = create_phi_node (new_phi_tree, loop->header);
5421 : 67 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5422 : : loop_preheader_edge (loop), UNKNOWN_LOCATION);
5423 : :
5424 : :       /* Now take the condition from the loop's original cond_exprs
5425 : :          and produce a new cond_expr (INDEX_COND_EXPR) which for
5426 : : every match uses values from the induction variable
5427 : : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5428 : : (NEW_PHI_TREE).
5429 : : Finally, we update the phi (NEW_PHI_TREE) to take the value of
5430 : : the new cond_expr (INDEX_COND_EXPR). */
5431 : 67 : gimple_seq stmts = NULL;
5432 : 210 : for (int i = ccompares.length () - 1; i != -1; --i)
5433 : : {
5434 : 76 : tree ccompare = ccompares[i].first;
5435 : 76 : if (ccompares[i].second)
5436 : 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5437 : : cr_index_vector_type,
5438 : : ccompare,
5439 : : indx_before_incr, new_phi_tree);
5440 : : else
5441 : 7 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5442 : : cr_index_vector_type,
5443 : : ccompare,
5444 : : new_phi_tree, indx_before_incr);
5445 : : }
5446 : 67 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5447 : :
5448 : : /* Update the phi with the vec cond. */
5449 : 67 : induction_index = new_phi_tree;
5450 : 67 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5451 : : loop_latch_edge (loop), UNKNOWN_LOCATION);
5452 : 67 : }
5453 : :
5454 : : /* 2. Create epilog code.
5455 : : The reduction epilog code operates across the elements of the vector
5456 : : of partial results computed by the vectorized loop.
5457 : : The reduction epilog code consists of:
5458 : :
5459 : : step 1: compute the scalar result in a vector (v_out2)
5460 : : step 2: extract the scalar result (s_out3) from the vector (v_out2)
5461 : : step 3: adjust the scalar result (s_out3) if needed.
5462 : :
5463 : :      Step 1 can be accomplished using one of the following three schemes:
5464 : : (scheme 1) using reduc_fn, if available.
5465 : : (scheme 2) using whole-vector shifts, if available.
5466 : : (scheme 3) using a scalar loop. In this case steps 1+2 above are
5467 : : combined.
5468 : :
5469 : : The overall epilog code looks like this:
5470 : :
5471 : : s_out0 = phi <s_loop> # original EXIT_PHI
5472 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5473 : : v_out2 = reduce <v_out1> # step 1
5474 : : s_out3 = extract_field <v_out2, 0> # step 2
5475 : : s_out4 = adjust_result <s_out3> # step 3
5476 : :
5477 : : (step 3 is optional, and steps 1 and 2 may be combined).
5478 : : Lastly, the uses of s_out0 are replaced by s_out4. */
5479 : :
5480 : :
5481 : : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5482 : : v_out1 = phi <VECT_DEF>
5483 : : Store them in NEW_PHIS. */
5484 : : /* We need to reduce values in all exits. */
5485 : 21922 : exit_bb = loop_exit->dest;
5486 : 21922 : exit_gsi = gsi_after_labels (exit_bb);
5487 : 21922 : reduc_inputs.create (vec_num);
5488 : 45298 : for (unsigned i = 0; i < vec_num; i++)
5489 : : {
5490 : 23376 : gimple_seq stmts = NULL;
5491 : 23376 : def = vect_get_slp_vect_def (slp_node, i);
5492 : 23376 : tree new_def = copy_ssa_name (def);
5493 : 23376 : phi = create_phi_node (new_def, exit_bb);
5494 : 23376 : if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
5495 : 23349 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
5496 : : else
5497 : : {
5498 : 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
5499 : 30 : SET_PHI_ARG_DEF (phi, k, def);
5500 : : }
5501 : 23376 : new_def = gimple_convert (&stmts, vectype, new_def);
5502 : 23376 : reduc_inputs.quick_push (new_def);
5503 : 23376 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5504 : : }
5505 : :
5506 : : /* 2.2 Get the original scalar reduction variable as defined in the loop.
5507 : : In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
5508 : : pattern), the scalar-def is taken from the original stmt that the
5509 : : pattern-stmt (STMT) replaces. */
5510 : :
5511 : 22742 : tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5512 : 21922 : tree scalar_type = TREE_TYPE (scalar_dest);
5513 : 21922 : scalar_results.truncate (0);
5514 : 21922 : scalar_results.reserve_exact (group_size);
5515 : 21922 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5516 : :
5517 : : /* True if we should implement SLP_REDUC using native reduction operations
5518 : : instead of scalar operations. */
5519 : 21922 : const bool direct_slp_reduc
5520 : 21922 : = (reduc_fn != IFN_LAST
5521 : 21922 : && slp_reduc
5522 : 21922 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5523 : :
5524 : : /* If signed overflow is undefined we might need to perform reduction
5525 : : computations in an unsigned type. */
5526 : 21922 : tree compute_vectype = vectype;
5527 : 21922 : if (ANY_INTEGRAL_TYPE_P (vectype)
5528 : 14919 : && TYPE_OVERFLOW_UNDEFINED (vectype)
5529 : 5440 : && code.is_tree_code ()
5530 : 27362 : && arith_code_with_undefined_signed_overflow ((tree_code) code))
5531 : 3985 : compute_vectype = unsigned_type_for (vectype);
5532 : :
5533 : : /* In case of reduction chain, e.g.,
5534 : : # a1 = phi <a3, a0>
5535 : : a2 = operation (a1)
5536 : : a3 = operation (a2),
5537 : :
5538 : : we may end up with more than one vector result. Here we reduce them
5539 : : to one vector.
5540 : :
5541 : : The same is true for a SLP reduction, e.g.,
5542 : : # a1 = phi <a2, a0>
5543 : : # b1 = phi <b2, b0>
5544 : : a2 = operation (a1)
5545 : : b2 = operation (a2),
5546 : :
5547 : : where we can end up with more than one vector as well. We can
5548 : : easily accumulate vectors when the number of vector elements is
5549 : : a multiple of the SLP group size.
5550 : :
5551 : : The same is true if we couldn't use a single defuse cycle. */
5552 : 21922 : if ((!slp_reduc
5553 : : || direct_slp_reduc
5554 : : || (slp_reduc
5555 : 21922 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
5556 : 43844 : && reduc_inputs.length () > 1)
5557 : : {
5558 : 539 : gimple_seq stmts = NULL;
5559 : 539 : tree single_input = reduc_inputs[0];
5560 : 539 : if (compute_vectype != vectype)
5561 : 154 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5562 : : compute_vectype, single_input);
5563 : 1843 : for (k = 1; k < reduc_inputs.length (); k++)
5564 : : {
5565 : 1304 : tree input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5566 : 1304 : compute_vectype, reduc_inputs[k]);
5567 : 1304 : single_input = gimple_build (&stmts, code, compute_vectype,
5568 : : single_input, input);
5569 : : }
5570 : 539 : if (compute_vectype != vectype)
5571 : 154 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5572 : : vectype, single_input);
5573 : 539 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5574 : :
5575 : 539 : reduc_inputs.truncate (0);
5576 : 539 : reduc_inputs.safe_push (single_input);
5577 : : }
5578 : :
5579 : 21922 : tree orig_reduc_input = reduc_inputs[0];
5580 : :
5581 : : /* If this loop is an epilogue loop that can be skipped after the
5582 : : main loop, we can only share a reduction operation between the
5583 : : main loop and the epilogue if we put it at the target of the
5584 : : skip edge.
5585 : :
5586 : : We can still reuse accumulators if this check fails. Doing so has
5587 : : the minor(?) benefit of making the epilogue loop's scalar result
5588 : : independent of the main loop's scalar result. */
5589 : 21922 : bool unify_with_main_loop_p = false;
5590 : 21922 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
5591 : 4171 : && loop_vinfo->skip_this_loop_edge
5592 : 3939 : && single_succ_p (exit_bb)
5593 : 21943 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5594 : : {
5595 : 21 : unify_with_main_loop_p = true;
5596 : :
5597 : 21 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5598 : 21 : reduc_inputs[0] = make_ssa_name (vectype);
5599 : 21 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5600 : 21 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5601 : : UNKNOWN_LOCATION);
5602 : 21 : add_phi_arg (new_phi,
5603 : 21 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
5604 : : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5605 : 21 : exit_gsi = gsi_after_labels (reduc_block);
5606 : : }
5607 : :
5608 : : /* Shouldn't be used beyond this point. */
5609 : 21922 : exit_bb = nullptr;
5610 : :
5611 : : /* If we are operating on a mask vector and do not support direct mask
5612 : : reduction, work on a bool data vector instead of a mask vector. */
5613 : 21922 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5614 : 227 : && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
5615 : 22114 : && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
5616 : : {
5617 : 192 : compute_vectype = vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
5618 : 192 : gimple_seq stmts = NULL;
5619 : 392 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5620 : 400 : reduc_inputs[i] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
5621 : 200 : reduc_inputs[i],
5622 : : build_one_cst (vectype),
5623 : : build_zero_cst (vectype));
5624 : 192 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5625 : : }
5626 : :
5627 : 21922 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5628 : 67 : && reduc_fn != IFN_LAST)
5629 : : {
5630 : : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5631 : : various data values where the condition matched and another vector
5632 : : (INDUCTION_INDEX) containing all the indexes of those matches. We
5633 : : need to extract the last matching index (which will be the index with
5634 : : highest value) and use this to index into the data vector.
5635 : : For the case where there were no matches, the data vector will contain
5636 : : all default values and the index vector will be all zeros. */
5637 : :
5638 : : /* Get various versions of the type of the vector of indexes. */
5639 : 4 : tree index_vec_type = TREE_TYPE (induction_index);
5640 : 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5641 : 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
5642 : 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
5643 : :
5644 : : /* Get an unsigned integer version of the type of the data vector. */
5645 : 4 : int scalar_precision
5646 : 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5647 : 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5648 : 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5649 : : vectype);
5650 : :
5651 : : /* First we need to create a vector (ZERO_VEC) of zeros and another
5652 : : vector (MAX_INDEX_VEC) filled with the last matching index, which we
5653 : : can create using a MAX reduction and then expanding.
5654 : : In the case where the loop never made any matches, the max index will
5655 : : be zero. */
5656 : :
5657 : : /* Vector of {0, 0, 0,...}. */
5658 : 4 : tree zero_vec = build_zero_cst (vectype);
5659 : :
5660 : : /* Find maximum value from the vector of found indexes. */
5661 : 4 : tree max_index = make_ssa_name (index_scalar_type);
5662 : 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5663 : : 1, induction_index);
5664 : 4 : gimple_call_set_lhs (max_index_stmt, max_index);
5665 : 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5666 : :
5667 : : /* Vector of {max_index, max_index, max_index,...}. */
5668 : 4 : tree max_index_vec = make_ssa_name (index_vec_type);
5669 : 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5670 : : max_index);
5671 : 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5672 : : max_index_vec_rhs);
5673 : 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5674 : :
5675 : : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5676 : : with the vector (INDUCTION_INDEX) of found indexes, choosing values
5677 : : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5678 : : otherwise. Only one value should match, resulting in a vector
5679 : : (VEC_COND) with one data value and the rest zeros.
5680 : : In the case where the loop never made any matches, every index will
5681 : : match, resulting in a vector with all data values (which will all be
5682 : : the default value). */
5683 : :
5684 : : /* Compare the max index vector to the vector of found indexes to find
5685 : : the position of the max value. */
5686 : 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
5687 : 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5688 : : induction_index,
5689 : : max_index_vec);
5690 : 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5691 : :
5692 : : /* Use the compare to choose either values from the data vector or
5693 : : zero. */
5694 : 4 : tree vec_cond = make_ssa_name (vectype);
5695 : 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5696 : : vec_compare,
5697 : 4 : reduc_inputs[0],
5698 : : zero_vec);
5699 : 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5700 : :
5701 : : /* Finally we need to extract the data value from the vector (VEC_COND)
5702 : :          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
5703 : : reduction, but because this doesn't exist, we can use a MAX reduction
5704 : : instead. The data value might be signed or a float so we need to cast
5705 : : it first.
5706 : : In the case where the loop never made any matches, the data values are
5707 : : all identical, and so will reduce down correctly. */
5708 : :
5709 : : /* Make the matched data values unsigned. */
5710 : 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5711 : 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5712 : : vec_cond);
5713 : 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5714 : : VIEW_CONVERT_EXPR,
5715 : : vec_cond_cast_rhs);
5716 : 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5717 : :
5718 : : /* Reduce down to a scalar value. */
5719 : 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
5720 : 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5721 : : 1, vec_cond_cast);
5722 : 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5723 : 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5724 : :
5725 : : /* Convert the reduced value back to the result type and set as the
5726 : : result. */
5727 : 4 : gimple_seq stmts = NULL;
5728 : 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5729 : : data_reduc);
5730 : 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5731 : 4 : scalar_results.safe_push (new_temp);
5732 : 4 : }
5733 : 21918 : else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5734 : 63 : && reduc_fn == IFN_LAST)
5735 : : {
5736 : : /* Condition reduction without supported IFN_REDUC_MAX. Generate
5737 : : idx = 0;
5738 : : idx_val = induction_index[0];
5739 : : val = data_reduc[0];
5740 : : for (idx = 0, val = init, i = 0; i < nelts; ++i)
5741 : : if (induction_index[i] > idx_val)
5742 : : val = data_reduc[i], idx_val = induction_index[i];
5743 : : return val; */
5744 : :
5745 : 63 : tree data_eltype = TREE_TYPE (vectype);
5746 : 63 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5747 : 63 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5748 : 63 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5749 : : /* Enforced by vectorizable_reduction, which ensures we have target
5750 : : support before allowing a conditional reduction on variable-length
5751 : : vectors. */
5752 : 63 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5753 : 63 : tree idx_val = NULL_TREE, val = NULL_TREE;
5754 : 419 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5755 : : {
5756 : 356 : tree old_idx_val = idx_val;
5757 : 356 : tree old_val = val;
5758 : 356 : idx_val = make_ssa_name (idx_eltype);
5759 : 356 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5760 : : build3 (BIT_FIELD_REF, idx_eltype,
5761 : : induction_index,
5762 : 356 : bitsize_int (el_size),
5763 : 356 : bitsize_int (off)));
5764 : 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5765 : 356 : val = make_ssa_name (data_eltype);
5766 : 712 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5767 : : build3 (BIT_FIELD_REF,
5768 : : data_eltype,
5769 : 356 : reduc_inputs[0],
5770 : 356 : bitsize_int (el_size),
5771 : 356 : bitsize_int (off)));
5772 : 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5773 : 356 : if (off != 0)
5774 : : {
5775 : 293 : tree new_idx_val = idx_val;
5776 : 293 : if (off != v_size - el_size)
5777 : : {
5778 : 230 : new_idx_val = make_ssa_name (idx_eltype);
5779 : 230 : epilog_stmt = gimple_build_assign (new_idx_val,
5780 : : MAX_EXPR, idx_val,
5781 : : old_idx_val);
5782 : 230 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5783 : : }
5784 : 293 : tree cond = make_ssa_name (boolean_type_node);
5785 : 293 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5786 : : idx_val, old_idx_val);
5787 : 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5788 : 293 : tree new_val = make_ssa_name (data_eltype);
5789 : 293 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5790 : : cond, val, old_val);
5791 : 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5792 : 293 : idx_val = new_idx_val;
5793 : 293 : val = new_val;
5794 : : }
5795 : : }
5796 : : /* Convert the reduced value back to the result type and set as the
5797 : : result. */
5798 : 63 : gimple_seq stmts = NULL;
5799 : 63 : val = gimple_convert (&stmts, scalar_type, val);
5800 : 63 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5801 : 63 : scalar_results.safe_push (val);
5802 : 63 : }
5803 : :
5804 : : /* 2.3 Create the reduction code, using one of the three schemes described
5805 : : above. In SLP we simply need to extract all the elements from the
5806 : : vector (without reducing them), so we use scalar shifts. */
5807 : 21855 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
5808 : : {
5809 : 20007 : tree tmp;
5810 : 20007 : tree vec_elem_type;
5811 : :
5812 : : /* Case 1: Create:
5813 : : v_out2 = reduc_expr <v_out1> */
5814 : :
5815 : 20007 : if (dump_enabled_p ())
5816 : 1412 : dump_printf_loc (MSG_NOTE, vect_location,
5817 : : "Reduce using direct vector reduction.\n");
5818 : :
5819 : 20007 : gimple_seq stmts = NULL;
5820 : 20007 : vec_elem_type = TREE_TYPE (vectype);
5821 : 20007 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5822 : 20007 : vec_elem_type, reduc_inputs[0]);
5823 : 20007 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5824 : 20007 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5825 : :
5826 : 20007 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5827 : 66 : && induc_val)
5828 : : {
5829 : : /* Earlier we set the initial value to be a vector if induc_val
5830 : : values. Check the result and if it is induc_val then replace
5831 : : with the original initial value, unless induc_val is
5832 : : the same as initial_def already. */
5833 : 63 : tree zcompare = make_ssa_name (boolean_type_node);
5834 : 63 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5835 : : new_temp, induc_val);
5836 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5837 : 63 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
5838 : 63 : tmp = make_ssa_name (new_scalar_dest);
5839 : 63 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5840 : : initial_def, new_temp);
5841 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5842 : 63 : new_temp = tmp;
5843 : : }
5844 : :
5845 : 20007 : scalar_results.safe_push (new_temp);
5846 : 20007 : }
5847 : 1658 : else if (direct_slp_reduc)
5848 : : {
5849 : : /* Here we create one vector for each of the GROUP_SIZE results,
5850 : : with the elements for other SLP statements replaced with the
5851 : : neutral value. We can then do a normal reduction on each vector. */
5852 : :
5853 : : /* Enforced by vectorizable_reduction. */
5854 : : gcc_assert (reduc_inputs.length () == 1);
5855 : : gcc_assert (pow2p_hwi (group_size));
5856 : :
5857 : : gimple_seq seq = NULL;
5858 : :
5859 : : /* Build a vector {0, 1, 2, ...}, with the same number of elements
5860 : : and the same element size as VECTYPE. */
5861 : : tree index = build_index_vector (vectype, 0, 1);
5862 : : tree index_type = TREE_TYPE (index);
5863 : : tree index_elt_type = TREE_TYPE (index_type);
5864 : : tree mask_type = truth_type_for (index_type);
5865 : :
5866 : : /* Create a vector that, for each element, identifies which of
5867 : : the results should use it. */
5868 : : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5869 : : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5870 : : build_vector_from_val (index_type, index_mask));
5871 : :
5872 : : /* Get a neutral vector value. This is simply a splat of the neutral
5873 : : scalar value if we have one, otherwise the initial scalar value
5874 : : is itself a neutral value. */
5875 : : tree vector_identity = NULL_TREE;
5876 : : tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5877 : : NULL_TREE, false);
5878 : : if (neutral_op)
5879 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5880 : : neutral_op);
5881 : : for (unsigned int i = 0; i < group_size; ++i)
5882 : : {
5883 : :           /* If there's no universal neutral value, we can use the
5884 : : initial scalar value from the original PHI. This is used
5885 : : for MIN and MAX reduction, for example. */
5886 : : if (!neutral_op)
5887 : : {
5888 : : tree scalar_value
5889 : : = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
5890 : : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5891 : : scalar_value);
5892 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5893 : : scalar_value);
5894 : : }
5895 : :
5896 : : /* Calculate the equivalent of:
5897 : :
5898 : : sel[j] = (index[j] == i);
5899 : :
5900 : : which selects the elements of REDUC_INPUTS[0] that should
5901 : : be included in the result. */
5902 : : tree compare_val = build_int_cst (index_elt_type, i);
5903 : : compare_val = build_vector_from_val (index_type, compare_val);
5904 : : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5905 : : index, compare_val);
5906 : :
5907 : : /* Calculate the equivalent of:
5908 : :
5909 : : vec = seq ? reduc_inputs[0] : vector_identity;
5910 : :
5911 : : VEC is now suitable for a full vector reduction. */
5912 : : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5913 : : sel, reduc_inputs[0], vector_identity);
5914 : :
5915 : : /* Do the reduction and convert it to the appropriate type. */
5916 : : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5917 : : TREE_TYPE (vectype), vec);
5918 : : scalar = gimple_convert (&seq, scalar_type, scalar);
5919 : : scalar_results.safe_push (scalar);
5920 : : }
5921 : : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5922 : : }
5923 : : else
5924 : : {
5925 : 1658 : bool reduce_with_shift;
5926 : 1658 : tree vec_temp;
5927 : :
5928 : 1658 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5929 : :
5930 : : /* See if the target wants to do the final (shift) reduction
5931 : : in a vector mode of smaller size and first reduce upper/lower
5932 : : halves against each other. */
5933 : 1848 : enum machine_mode mode1 = mode;
5934 : 1848 : tree stype = TREE_TYPE (vectype);
5935 : 1848 : if (compute_vectype != vectype)
5936 : : {
5937 : 461 : stype = unsigned_type_for (stype);
5938 : 461 : gimple_seq stmts = NULL;
5939 : 992 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5940 : : {
5941 : 531 : tree new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5942 : 531 : compute_vectype, reduc_inputs[i]);
5943 : 531 : reduc_inputs[i] = new_temp;
5944 : : }
5945 : 461 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5946 : : }
5947 : 1848 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5948 : 1848 : unsigned nunits1 = nunits;
5949 : 1848 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5950 : 1848 : && reduc_inputs.length () == 1)
5951 : : {
5952 : 43 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5953 : : /* For SLP reductions we have to make sure lanes match up, but
5954 : : since we're doing individual element final reduction reducing
5955 : : vector width here is even more important.
5956 : : ??? We can also separate lanes with permutes, for the common
5957 : : case of power-of-two group-size odd/even extracts would work. */
5958 : 43 : if (slp_reduc && nunits != nunits1)
5959 : : {
5960 : 43 : nunits1 = least_common_multiple (nunits1, group_size);
5961 : 86 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5962 : : }
5963 : : }
5964 : 1805 : else if (!slp_reduc
5965 : 1805 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5966 : 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5967 : :
5968 : 1848 : tree vectype1 = vectype;
5969 : 1848 : if (mode1 != mode)
5970 : : {
5971 : 46 : vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5972 : 46 : stype, nunits1);
5973 : : /* First reduce the vector to the desired vector size we should
5974 : : do shift reduction on by combining upper and lower halves. */
5975 : 46 : gimple_seq stmts = NULL;
5976 : 46 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5977 : : code, &stmts);
5978 : 46 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5979 : 46 : reduc_inputs[0] = new_temp;
5980 : : }
5981 : :
5982 : 1848 : reduce_with_shift = have_whole_vector_shift (mode1);
5983 : 733 : if (!VECTOR_MODE_P (mode1)
5984 : 2579 : || !directly_supported_p (code, vectype1))
5985 : : reduce_with_shift = false;
5986 : :
5987 : 1831 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
5988 : : {
5989 : 1604 : tree bitsize = TYPE_SIZE (TREE_TYPE (vectype1));
5990 : 1604 : int element_bitsize = tree_to_uhwi (bitsize);
5991 : : /* Enforced by vectorizable_reduction, which disallows SLP reductions
5992 : : for variable-length vectors and also requires direct target support
5993 : : for loop reductions. */
5994 : 1604 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5995 : 1604 : int nelements = vec_size_in_bits / element_bitsize;
5996 : 1604 : vec_perm_builder sel;
5997 : 1604 : vec_perm_indices indices;
5998 : :
5999 : 1604 : int elt_offset;
6000 : :
6001 : 1604 : tree zero_vec = build_zero_cst (vectype1);
6002 : : /* Case 2: Create:
6003 : : for (offset = nelements/2; offset >= 1; offset/=2)
6004 : : {
6005 : : Create: va' = vec_shift <va, offset>
6006 : : Create: va = vop <va, va'>
6007 : : } */
6008 : :
6009 : 1604 : if (dump_enabled_p ())
6010 : 352 : dump_printf_loc (MSG_NOTE, vect_location,
6011 : : "Reduce using vector shifts\n");
6012 : :
6013 : 1604 : gimple_seq stmts = NULL;
6014 : 1604 : new_temp = gimple_convert (&stmts, vectype1, reduc_inputs[0]);
6015 : 1604 : for (elt_offset = nelements / 2;
6016 : 3505 : elt_offset >= 1;
6017 : 1901 : elt_offset /= 2)
6018 : : {
6019 : 1901 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6020 : 1901 : indices.new_vector (sel, 2, nelements);
6021 : 1901 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6022 : 1901 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6023 : : new_temp, zero_vec, mask);
6024 : 1901 : new_temp = gimple_build (&stmts, code,
6025 : : vectype1, new_name, new_temp);
6026 : : }
6027 : :
6028 : : /* 2.4 Extract the final scalar result. Create:
6029 : : s_out3 = extract_field <v_out2, bitpos> */
6030 : :
6031 : 1604 : if (dump_enabled_p ())
6032 : 352 : dump_printf_loc (MSG_NOTE, vect_location,
6033 : : "extract scalar result\n");
6034 : :
6035 : 1604 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
6036 : 1604 : new_temp, bitsize, bitsize_zero_node);
6037 : 1604 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6038 : 1604 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6039 : 1604 : scalar_results.safe_push (new_temp);
6040 : 1604 : }
6041 : : else
6042 : : {
6043 : : /* Case 3: Create:
6044 : : s = extract_field <v_out2, 0>
6045 : : for (offset = element_size;
6046 : : offset < vector_size;
6047 : : offset += element_size;)
6048 : : {
6049 : : Create: s' = extract_field <v_out2, offset>
6050 : : Create: s = op <s, s'> // For non SLP cases
6051 : : } */
6052 : :
6053 : 244 : if (dump_enabled_p ())
6054 : 147 : dump_printf_loc (MSG_NOTE, vect_location,
6055 : : "Reduce using scalar code.\n");
6056 : :
6057 : 244 : tree compute_type = TREE_TYPE (vectype1);
6058 : 244 : unsigned vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6059 : 244 : unsigned element_bitsize = vector_element_bits (vectype1);
6060 : 244 : tree bitsize = bitsize_int (element_bitsize);
6061 : 244 : gimple_seq stmts = NULL;
6062 : 638 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6063 : : {
6064 : 394 : unsigned bit_offset;
6065 : 788 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6066 : 394 : vec_temp, bitsize, bitsize_zero_node);
6067 : :
6068 : : /* In SLP we don't need to apply reduction operation, so we just
6069 : : collect s' values in SCALAR_RESULTS. */
6070 : 394 : if (slp_reduc)
6071 : 384 : scalar_results.safe_push (new_temp);
6072 : :
6073 : 952 : for (bit_offset = element_bitsize;
6074 : 1346 : bit_offset < vec_size_in_bits;
6075 : 952 : bit_offset += element_bitsize)
6076 : : {
6077 : 952 : tree bitpos = bitsize_int (bit_offset);
6078 : 952 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6079 : : compute_type, vec_temp,
6080 : : bitsize, bitpos);
6081 : 952 : if (slp_reduc)
6082 : : {
6083 : : /* In SLP we don't need to apply reduction operation, so
6084 : : we just collect s' values in SCALAR_RESULTS. */
6085 : 942 : new_temp = new_name;
6086 : 942 : scalar_results.safe_push (new_name);
6087 : : }
6088 : : else
6089 : 10 : new_temp = gimple_build (&stmts, code, compute_type,
6090 : : new_name, new_temp);
6091 : : }
6092 : : }
6093 : :
6094 : : /* The only case where we need to reduce scalar results in a SLP
6095 : : reduction, is unrolling. If the size of SCALAR_RESULTS is
6096 : : greater than GROUP_SIZE, we reduce them combining elements modulo
6097 : : GROUP_SIZE. */
6098 : 244 : if (slp_reduc)
6099 : : {
6100 : 234 : tree res, first_res, new_res;
6101 : :
6102 : : /* Reduce multiple scalar results in case of SLP unrolling. */
6103 : 865 : for (j = group_size; scalar_results.iterate (j, &res);
6104 : : j++)
6105 : : {
6106 : 631 : first_res = scalar_results[j % group_size];
6107 : 631 : new_res = gimple_build (&stmts, code, compute_type,
6108 : : first_res, res);
6109 : 631 : scalar_results[j % group_size] = new_res;
6110 : : }
6111 : 234 : scalar_results.truncate (group_size);
6112 : 1163 : for (k = 0; k < group_size; k++)
6113 : 1390 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6114 : 695 : scalar_results[k]);
6115 : : }
6116 : : else
6117 : : {
6118 : : /* Reduction chain - we have one scalar to keep in
6119 : : SCALAR_RESULTS. */
6120 : 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6121 : 10 : scalar_results.safe_push (new_temp);
6122 : : }
6123 : :
6124 : 244 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6125 : : }
6126 : :
6127 : 1848 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6128 : 0 : && induc_val)
6129 : : {
6130 : : /* Earlier we set the initial value to be a vector of induc_val
6131 : : values. Check the result and if it is induc_val then replace
6132 : : with the original initial value, unless induc_val is
6133 : : the same as initial_def already. */
6134 : 0 : tree zcompare = make_ssa_name (boolean_type_node);
6135 : 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6136 : 0 : scalar_results[0], induc_val);
6137 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6138 : 0 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
6139 : 0 : tree tmp = make_ssa_name (new_scalar_dest);
6140 : 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6141 : 0 : initial_def, scalar_results[0]);
6142 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6143 : 0 : scalar_results[0] = tmp;
6144 : : }
6145 : : }
6146 : :
6147 : : /* 2.5 Adjust the final result by the initial value of the reduction
6148 : : variable. (When such adjustment is not needed, then
6149 : : 'adjustment_def' is zero). For example, if code is PLUS we create:
6150 : : new_temp = loop_exit_def + adjustment_def */
6151 : :
6152 : 21922 : if (adjustment_def)
6153 : : {
6154 : 15833 : gcc_assert (!slp_reduc || group_size == 1);
6155 : 15833 : gimple_seq stmts = NULL;
6156 : 15833 : if (double_reduc)
6157 : : {
6158 : 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6159 : 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6160 : 0 : new_temp = gimple_build (&stmts, code, vectype,
6161 : 0 : reduc_inputs[0], adjustment_def);
6162 : : }
6163 : : else
6164 : : {
6165 : 15833 : new_temp = scalar_results[0];
6166 : 15833 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6167 : 15833 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6168 : : adjustment_def);
6169 : 15833 : new_temp = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6170 : : new_temp);
6171 : 15833 : new_temp = gimple_build (&stmts, code, TREE_TYPE (compute_vectype),
6172 : : new_temp, adjustment_def);
6173 : 15833 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6174 : : }
6175 : :
6176 : 15833 : epilog_stmt = gimple_seq_last_stmt (stmts);
6177 : 15833 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6178 : 15833 : scalar_results[0] = new_temp;
6179 : : }
6180 : :
6181 : : /* Record this operation if it could be reused by the epilogue loop. */
6182 : 21922 : if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
6183 : 21922 : && reduc_inputs.length () == 1)
6184 : 21735 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6185 : : { orig_reduc_input, reduc_info });
6186 : :
6187 : : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6188 : : phis with new adjusted scalar results, i.e., replace use <s_out0>
6189 : : with use <s_out4>.
6190 : :
6191 : : Transform:
6192 : : loop_exit:
6193 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6194 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6195 : : v_out2 = reduce <v_out1>
6196 : : s_out3 = extract_field <v_out2, 0>
6197 : : s_out4 = adjust_result <s_out3>
6198 : : use <s_out0>
6199 : : use <s_out0>
6200 : :
6201 : : into:
6202 : :
6203 : : loop_exit:
6204 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6205 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6206 : : v_out2 = reduce <v_out1>
6207 : : s_out3 = extract_field <v_out2, 0>
6208 : : s_out4 = adjust_result <s_out3>
6209 : : use <s_out4>
6210 : : use <s_out4> */
6211 : :
6212 : 43844 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6213 : 21922 : auto_vec<gimple *> phis;
6214 : 44305 : for (k = 0; k < live_out_stmts.size (); k++)
6215 : : {
6216 : 22383 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6217 : 22383 : tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6218 : :
6219 : : /* Find the loop-closed-use at the loop exit of the original scalar
6220 : : result. (The reduction result is expected to have two immediate uses,
6221 : : one at the latch block, and one at the loop exit). Note with
6222 : : early break we can have two exit blocks, so pick the correct PHI. */
6223 : 113800 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6224 : 69034 : if (!is_gimple_debug (USE_STMT (use_p))
6225 : 69034 : && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6226 : : {
6227 : 22378 : gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
6228 : 22378 : if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6229 : 22370 : phis.safe_push (USE_STMT (use_p));
6230 : 22383 : }
6231 : :
6232 : 44753 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6233 : : {
6234 : : /* Replace the uses: */
6235 : 22370 : orig_name = PHI_RESULT (exit_phi);
6236 : :
6237 : : /* Look for a single use at the target of the skip edge. */
6238 : 22370 : if (unify_with_main_loop_p)
6239 : : {
6240 : 38 : use_operand_p use_p;
6241 : 38 : gimple *user;
6242 : 38 : if (!single_imm_use (orig_name, &use_p, &user))
6243 : 0 : gcc_unreachable ();
6244 : 38 : orig_name = gimple_get_lhs (user);
6245 : : }
6246 : :
6247 : 22370 : scalar_result = scalar_results[k];
6248 : 83031 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6249 : : {
6250 : 114917 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6251 : 38313 : SET_USE (use_p, scalar_result);
6252 : 38291 : update_stmt (use_stmt);
6253 : 22370 : }
6254 : : }
6255 : :
6256 : 22383 : phis.truncate (0);
6257 : : }
6258 : 21922 : }
6259 : :
6260 : : /* Return a vector of type VECTYPE that is equal to the vector select
6261 : : operation "MASK ? VEC : IDENTITY". Insert the select statements
6262 : : before GSI. */
6263 : :
6264 : : static tree
6265 : 9 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6266 : : tree vec, tree identity)
6267 : : {
 : : /* Fresh temporary of VECTYPE holding MASK ? VEC : IDENTITY.  */
6268 : 9 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6269 : 9 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6270 : : mask, vec, identity);
 : : /* Emit the select ahead of GSI so the caller can consume COND at the
 : : iterator's current position.  */
6271 : 9 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6272 : 9 : return cond;
6273 : : }
6274 : :
6275 : : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6276 : : order, starting with LHS. Insert the extraction statements before GSI and
6277 : : associate the new scalar SSA names with variable SCALAR_DEST.
6278 : : If MASK is nonzero mask the input and then operate on it unconditionally.
6279 : : Return the SSA name for the result. */
6280 : :
6281 : : static tree
6282 : 1043 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6283 : : tree_code code, tree lhs, tree vector_rhs,
6284 : : tree mask)
6285 : : {
6286 : 1043 : tree vectype = TREE_TYPE (vector_rhs);
6287 : 1043 : tree scalar_type = TREE_TYPE (vectype);
6288 : 1043 : tree bitsize = TYPE_SIZE (scalar_type);
6289 : 1043 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6290 : 1043 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6291 : :
6292 : : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6293 : : to perform an unconditional element-wise reduction of it.  Inactive
6294 : : lanes receive the operation's neutral value so they do not change
6295 : : the running result. */
6294 : 1043 : if (mask)
6295 : : {
6296 : 45 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6297 : : "masked_vector_rhs")
6298 : 45 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6299 : : false);
6300 : 45 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6301 : 45 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6302 : : mask, vector_rhs, vector_identity);
6303 : 45 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6304 : 45 : vector_rhs = masked_vector_rhs;
6305 : : }
6306 : :
 : : /* Extract the elements one BIT_FIELD_REF at a time and apply CODE
 : : serially, left to right, so the original scalar evaluation order
 : : is preserved. */
6307 : 1043 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6308 : 4647 : bit_offset < vec_size_in_bits;
6309 : 3604 : bit_offset += element_bitsize)
6310 : : {
6311 : 3604 : tree bitpos = bitsize_int (bit_offset);
6312 : 3604 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6313 : : bitsize, bitpos);
6314 : :
6315 : 3604 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6316 : 3604 : rhs = make_ssa_name (scalar_dest, stmt);
6317 : 3604 : gimple_assign_set_lhs (stmt, rhs);
6318 : 3604 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6319 : : /* Fold the vector extract, combining it with a previous reversal
6320 : : like seen in PR90579. */
6321 : 3604 : auto gsi2 = gsi_for_stmt (stmt);
6322 : 3604 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6323 : 356 : update_stmt (gsi_stmt (gsi2));
6324 : :
 : : /* Accumulate: lhs = lhs CODE element.  */
6325 : 3604 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6326 : 3604 : tree new_name = make_ssa_name (scalar_dest, stmt);
6327 : 3604 : gimple_assign_set_lhs (stmt, new_name);
6328 : 3604 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6329 : 3604 : lhs = new_name;
6330 : : }
6331 : 1043 : return lhs;
6332 : : }
6333 : :
6334 : : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6335 : : type of the vector input. */
6336 : :
6337 : : static internal_fn
6338 : 2520 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6339 : : {
6340 : 2520 : internal_fn mask_reduc_fn;
6341 : 2520 : internal_fn mask_len_reduc_fn;
6342 : :
 : : /* Map the unmasked in-order reduction to its masked and mask+length
 : : counterparts; only FOLD_LEFT_PLUS has such variants here.  */
6343 : 2520 : switch (reduc_fn)
6344 : : {
6345 : 0 : case IFN_FOLD_LEFT_PLUS:
6346 : 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6347 : 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6348 : 0 : break;
6349 : :
6350 : : default:
6351 : : return IFN_LAST;
6352 : : }
6353 : :
 : : /* Prefer the plain masked variant when the target supports it
 : : directly; otherwise try the mask+length form before giving up.  */
6354 : 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6355 : : OPTIMIZE_FOR_SPEED))
6356 : : return mask_reduc_fn;
6357 : 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6358 : : OPTIMIZE_FOR_SPEED))
6359 : : return mask_len_reduc_fn;
6360 : : return IFN_LAST;
6361 : : }
6362 : :
6363 : : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6364 : : statement that sets the live-out value. REDUC_DEF_STMT is the phi
6365 : : statement. CODE is the operation performed by STMT_INFO and OPS are
6366 : : its scalar operands. REDUC_INDEX is the index of the operand in
6367 : : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6368 : : implements in-order reduction, or IFN_LAST if we should open-code it.
6369 : : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6370 : : that should be used to control the operation in a fully-masked loop. */
6371 : :
6372 : : static bool
6373 : 830 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6374 : : stmt_vec_info stmt_info,
6375 : : gimple_stmt_iterator *gsi,
6376 : : slp_tree slp_node,
6377 : : code_helper code, internal_fn reduc_fn,
6378 : : int num_ops, tree vectype_in,
6379 : : int reduc_index, vec_loop_masks *masks,
6380 : : vec_loop_lens *lens)
6381 : : {
6382 : 830 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6383 : 830 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
 : : /* IFN_LAST here means the target has no masked in-order reduction;
 : : masking must then be done by blending with an identity value.  */
6384 : 830 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6385 : :
6386 : 830 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6387 : :
 : : /* A conditional internal function (IFN_COND_*) is rewritten to its
 : : underlying tree code; the mask operand is handled separately below.  */
6388 : 830 : bool is_cond_op = false;
6389 : 830 : if (!code.is_tree_code ())
6390 : : {
6391 : 15 : code = conditional_internal_fn_code (internal_fn (code));
6392 : 15 : gcc_assert (code != ERROR_MARK);
6393 : : is_cond_op = true;
6394 : : }
6395 : :
6396 : 830 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
6397 : :
6398 : 830 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6399 : : TYPE_VECTOR_SUBPARTS (vectype_in)));
6400 : :
6401 : : /* ??? We should, when transforming the cycle PHI, record the existing
6402 : : scalar def as vector def so looking up the vector def works. This
6403 : : would also allow generalizing this for reduction paths of length > 1
6404 : : and/or SLP reductions. */
6405 : 830 : slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
6406 : 830 : stmt_vec_info reduc_var_def = SLP_TREE_SCALAR_STMTS (reduc_node)[0];
6407 : 830 : tree reduc_var = gimple_get_lhs (STMT_VINFO_STMT (reduc_var_def));
6408 : :
6409 : : /* The operands either come from a binary operation or an IFN_COND operation.
6410 : : The former is a gimple assign with binary rhs and the latter is a
6411 : : gimple call with four arguments. */
6412 : 830 : gcc_assert (num_ops == 2 || num_ops == 4);
6413 : :
6414 : 830 : auto_vec<tree> vec_oprnds0, vec_opmask;
6415 : 830 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
6416 : 830 : + (1 - reduc_index)],
6417 : : &vec_oprnds0);
6418 : : /* For an IFN_COND_OP we also need the vector mask operand. */
6419 : 830 : if (is_cond_op)
6420 : 15 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
6421 : :
6422 : : /* The transform below relies on preserving the original scalar PHI
6423 : : and its latch def which we replace. So work backwards from there. */
6424 : 830 : tree scalar_dest
6425 : 830 : = gimple_phi_arg_def_from_edge (as_a <gphi *> (STMT_VINFO_STMT
6426 : : (reduc_var_def)),
6427 : 830 : loop_latch_edge (loop));
6428 : 830 : stmt_vec_info scalar_dest_def_info
6429 : 830 : = vect_stmt_to_vectorize (loop_vinfo->lookup_def (scalar_dest));
6430 : 830 : tree scalar_type = TREE_TYPE (scalar_dest);
6431 : :
6432 : 830 : int vec_num = vec_oprnds0.length ();
6433 : 830 : tree vec_elem_type = TREE_TYPE (vectype_out);
6434 : 830 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6435 : :
 : : /* For a fully-masked loop inactive lanes must see an identity value.
 : : Zero is used, negated to -0.0 when signed zeros are honored (since
 : : x + -0.0 == x for every x); sign-dependent rounding is asserted
 : : away.  */
6436 : 830 : tree vector_identity = NULL_TREE;
6437 : 830 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6438 : : {
6439 : 2 : vector_identity = build_zero_cst (vectype_out);
6440 : 2 : if (!HONOR_SIGNED_ZEROS (vectype_out))
6441 : : ;
6442 : : else
6443 : : {
6444 : 2 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6445 : 2 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6446 : : vector_identity);
6447 : : }
6448 : : }
6449 : :
 : : /* Emit one in-order reduction step per vector operand, threading
 : : REDUC_VAR through the chain.  */
6450 : 830 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6451 : 830 : int i;
6452 : 830 : tree def0;
6453 : 1873 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6454 : : {
6455 : 1043 : gimple *new_stmt;
6456 : 1043 : tree mask = NULL_TREE;
6457 : 1043 : tree len = NULL_TREE;
6458 : 1043 : tree bias = NULL_TREE;
 : : /* Combine the loop mask with the IFN_COND mask when both exist.  */
6459 : 1043 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6460 : : {
6461 : 9 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
6462 : : vec_num, vectype_in, i);
6463 : 9 : if (is_cond_op)
6464 : 9 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
6465 : 9 : loop_mask, vec_opmask[i], gsi);
6466 : : else
6467 : : mask = loop_mask;
6468 : : }
6469 : 1034 : else if (is_cond_op)
6470 : 36 : mask = vec_opmask[i];
6471 : 1043 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6472 : : {
6473 : 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6474 : : i, 1);
6475 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6476 : 0 : bias = build_int_cst (intQI_type_node, biasval);
6477 : 0 : if (!is_cond_op)
6478 : 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
6479 : : }
6480 : :
6481 : : /* Handle MINUS by adding the negative. */
6482 : 1043 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6483 : : {
6484 : 0 : tree negated = make_ssa_name (vectype_out);
6485 : 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6486 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6487 : 0 : def0 = negated;
6488 : : }
6489 : :
 : : /* No masked reduction IFN available: blend inactive lanes with the
 : : identity so the reduction can run unconditionally.  */
6490 : 9 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
6491 : 1052 : && mask && mask_reduc_fn == IFN_LAST)
6492 : 9 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6493 : : vector_identity);
6494 : :
6495 : : /* On the first iteration the input is simply the scalar phi
6496 : : result, and for subsequent iterations it is the output of
6497 : : the preceding operation. */
6498 : 1043 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6499 : : {
6500 : 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6501 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6502 : : def0, mask, len, bias);
6503 : 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6504 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6505 : : def0, mask);
6506 : : else
6507 : 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6508 : : def0);
6509 : : /* For chained SLP reductions the output of the previous reduction
6510 : : operation serves as the input of the next. For the final statement
6511 : : the output cannot be a temporary - we reuse the original
6512 : : scalar destination of the last statement. */
6513 : 0 : if (i != vec_num - 1)
6514 : : {
6515 : 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
6516 : 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6517 : 0 : gimple_set_lhs (new_stmt, reduc_var);
6518 : : }
6519 : : }
6520 : : else
6521 : : {
 : : /* Open-code the in-order reduction as a chain of scalar
 : : extract-and-accumulate statements.  */
6522 : 1043 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
6523 : : tree_code (code), reduc_var, def0,
6524 : : mask);
6525 : 1043 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6526 : : /* Remove the statement, so that we can use the same code paths
6527 : : as for statements that we've just created. */
6528 : 1043 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6529 : 1043 : gsi_remove (&tmp_gsi, true);
6530 : : }
6531 : :
 : : /* The last step replaces the original latch definition so the
 : : scalar PHI keeps feeding the loop.  */
6532 : 1043 : if (i == vec_num - 1)
6533 : : {
6534 : 830 : gimple_set_lhs (new_stmt, scalar_dest);
6535 : 830 : vect_finish_replace_stmt (loop_vinfo,
6536 : : scalar_dest_def_info,
6537 : : new_stmt);
6538 : : }
6539 : : else
6540 : 213 : vect_finish_stmt_generation (loop_vinfo,
6541 : : scalar_dest_def_info,
6542 : : new_stmt, gsi);
6543 : :
6544 : 1043 : slp_node->push_vec_def (new_stmt);
6545 : : }
6546 : :
6547 : 830 : return true;
6548 : 830 : }
6549 : :
6550 : : /* Function is_nonwrapping_integer_induction.
6551 : :
6552 : : Check if STMT_VINFO (which is part of loop LOOP) both increments and
6553 : : does not cause overflow. */
6554 : :
6555 : : static bool
6556 : 411 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6557 : : {
6558 : 411 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6559 : 411 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6560 : 411 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6561 : 411 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6562 : 411 : widest_int ni, max_loop_value, lhs_max;
6563 : 411 : wi::overflow_type overflow = wi::OVF_NONE;
6564 : :
6565 : : /* Make sure the loop is integer based. */
6566 : 411 : if (TREE_CODE (base) != INTEGER_CST
6567 : 112 : || TREE_CODE (step) != INTEGER_CST)
6568 : : return false;
6569 : :
6570 : : /* Check that the max size of the loop will not wrap. */
6571 : :
 : : /* Types with undefined overflow cannot legally wrap, so nothing
 : : further needs checking.  */
6572 : 112 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6573 : : return true;
6574 : :
6575 : 8 : if (! max_stmt_executions (loop, &ni))
6576 : : return false;
6577 : :
 : : /* Compute BASE + STEP * NITERS in infinite precision and verify the
 : : final value still fits LHS_TYPE; any intermediate overflow means
 : : we cannot prove non-wrapping.  */
6578 : 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6579 : 8 : &overflow);
6580 : 8 : if (overflow)
6581 : : return false;
6582 : :
6583 : 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6584 : 16 : TYPE_SIGN (lhs_type), &overflow);
6585 : 8 : if (overflow)
6586 : : return false;
6587 : :
6588 : 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6589 : 8 : <= TYPE_PRECISION (lhs_type));
6590 : 411 : }
6591 : :
6592 : : /* Check if masking can be supported by inserting a conditional expression.
6593 : : CODE is the code for the operation. COND_FN is the conditional internal
6594 : : function, if it exists. VECTYPE_IN is the type of the vector input. */
6595 : : static bool
6596 : 5264 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6597 : : tree vectype_in)
6598 : : {
 : : /* A directly supported conditional internal function is preferable
 : : to the VEC_COND_EXPR workaround, so refuse it in that case.  */
6599 : 5264 : if (cond_fn != IFN_LAST
6600 : 5264 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
6601 : : OPTIMIZE_FOR_SPEED))
6602 : : return false;
6603 : :
 : : /* Only the lane-reducing DOT_PROD/SAD operations can be masked by
 : : selecting a neutral operand (see build_vect_cond_expr).  */
6604 : 3770 : if (code.is_tree_code ())
6605 : 3768 : switch (tree_code (code))
6606 : : {
6607 : : case DOT_PROD_EXPR:
6608 : : case SAD_EXPR:
6609 : : return true;
6610 : :
6611 : : default:
6612 : : break;
6613 : : }
6614 : : return false;
6615 : : }
6616 : :
6617 : : /* Insert a conditional expression to enable masked vectorization. CODE is the
6618 : : code for the operation. VOP is the array of operands. MASK is the loop
6619 : : mask. GSI is a statement iterator used to place the new conditional
6620 : : expression. */
6621 : : static void
6622 : 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6623 : : gimple_stmt_iterator *gsi)
6624 : : {
6625 : 4 : switch (tree_code (code))
6626 : : {
 : : /* DOT_PROD: zero out inactive lanes of one multiplicand so their
 : : products contribute nothing to the accumulated sum.  */
6627 : 4 : case DOT_PROD_EXPR:
6628 : 4 : {
6629 : 4 : tree vectype = TREE_TYPE (vop[1]);
6630 : 4 : tree zero = build_zero_cst (vectype);
6631 : 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6632 : 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6633 : : mask, vop[1], zero);
6634 : 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6635 : 4 : vop[1] = masked_op1;
6636 : 4 : break;
6637 : : }
6638 : :
 : : /* SAD: for inactive lanes make both inputs equal (select VOP[0])
 : : so the absolute difference is zero.  */
6639 : 0 : case SAD_EXPR:
6640 : 0 : {
6641 : 0 : tree vectype = TREE_TYPE (vop[1]);
6642 : 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6643 : 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6644 : : mask, vop[1], vop[0]);
6645 : 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6646 : 0 : vop[1] = masked_op1;
6647 : 0 : break;
6648 : : }
6649 : :
6650 : 0 : default:
6651 : 0 : gcc_unreachable ();
6652 : : }
6653 : 4 : }
6654 : :
6655 : : /* Given an operation with CODE in loop reduction path whose reduction PHI is
6656 : : specified by REDUC_INFO, the operation has TYPE of scalar result, and its
6657 : : input vectype is represented by VECTYPE_IN. The vectype of vectorized result
6658 : : may be different from VECTYPE_IN, either in base type or vectype lanes,
6659 : : lane-reducing operation is the case. This function check if it is possible,
6660 : : and how to perform partial vectorization on the operation in the context
6661 : : of LOOP_VINFO. */
6662 : :
6663 : : static void
6664 : 3392 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
6665 : : vect_reduc_info reduc_info,
6666 : : slp_tree slp_node,
6667 : : code_helper code, tree type,
6668 : : tree vectype_in)
6669 : : {
6670 : 3392 : enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
6671 : 3392 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
 : : /* If CODE is already a masked internal function use it as-is,
 : : otherwise look up the conditional form of the operation.  */
6672 : 3392 : internal_fn cond_fn
6673 : 920 : = ((code.is_internal_fn ()
6674 : 920 : && internal_fn_mask_index ((internal_fn)code) != -1)
6675 : 3392 : ? (internal_fn)code : get_conditional_internal_fn (code, type));
6676 : :
 : : /* A non-fold-left reduction needs some way to neutralize inactive
 : : lanes: either the VEC_COND trick (use_mask_by_cond_expr_p) or a
 : : directly supported conditional function.  Without one, partial
 : : vectors are off the table.  */
6677 : 3392 : if (reduc_type != FOLD_LEFT_REDUCTION
6678 : 2717 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6679 : 6066 : && (cond_fn == IFN_LAST
6680 : 2674 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6681 : : OPTIMIZE_FOR_SPEED)))
6682 : : {
6683 : 1702 : if (dump_enabled_p ())
6684 : 97 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6685 : : "can't operate on partial vectors because"
6686 : : " no conditional operation is available.\n");
6687 : 1702 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6688 : : }
 : : /* An open-coded fold-left reduction needs VEC_COND_EXPR support to
 : : blend inactive lanes with the identity value.  */
6689 : 1690 : else if (reduc_type == FOLD_LEFT_REDUCTION
6690 : 1690 : && reduc_fn == IFN_LAST
6691 : 1690 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
6692 : : {
6693 : 0 : if (dump_enabled_p ())
6694 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6695 : : "can't operate on partial vectors because"
6696 : : " no conditional operation is available.\n");
6697 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6698 : : }
 : : /* NOTE(review): this condition tests sign-dependent rounding, yet
 : : the dump text talks about signed zeros — possibly stale wording;
 : : confirm against upstream history before relying on it.  */
6699 : 1690 : else if (reduc_type == FOLD_LEFT_REDUCTION
6700 : 675 : && internal_fn_mask_index (reduc_fn) == -1
6701 : 675 : && FLOAT_TYPE_P (vectype_in)
6702 : 2360 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
6703 : : {
6704 : 0 : if (dump_enabled_p ())
6705 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6706 : : "can't operate on partial vectors because"
6707 : : " signed zeros cannot be preserved.\n");
6708 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6709 : : }
6710 : : else
6711 : : {
 : : /* Partial vectors are usable: record the mask or length
 : : requirement for this reduction.  */
6712 : 1690 : internal_fn mask_reduc_fn
6713 : 1690 : = get_masked_reduction_fn (reduc_fn, vectype_in);
6714 : 1690 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6715 : 1690 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
6716 : 1690 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);
6717 : :
6718 : 1690 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6719 : 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
6720 : : else
6721 : 1690 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
6722 : : }
6723 : 3392 : }
6724 : :
6725 : : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
6726 : : the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
6727 : : and the analysis is for slp if SLP_NODE is not NULL.
6728 : :
6729 : : For a lane-reducing operation, the loop reduction path that it lies in,
6730 : : may contain normal operation, or other lane-reducing operation of different
6731 : : input type size, an example as:
6732 : :
6733 : : int sum = 0;
6734 : : for (i)
6735 : : {
6736 : : ...
6737 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
6738 : : sum += w[i]; // widen-sum <vector(16) char>
6739 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
6740 : : sum += n[i]; // normal <vector(4) int>
6741 : : ...
6742 : : }
6743 : :
6744 : : Vectorization factor is essentially determined by operation whose input
6745 : : vectype has the most lanes ("vector(16) char" in the example), while we
6746 : : need to choose input vectype with the least lanes ("vector(4) int" in the
6747 : : example) to determine effective number of vector reduction PHIs. */
6748 : :
6749 : : bool
6750 : 307273 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
6751 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
6752 : : {
6753 : 307273 : gimple *stmt = stmt_info->stmt;
6754 : :
6755 : 307273 : if (!lane_reducing_stmt_p (stmt))
6756 : : return false;
6757 : :
6758 : 462 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
6759 : :
6760 : 462 : if (!INTEGRAL_TYPE_P (type))
6761 : : return false;
6762 : :
6763 : : /* Do not try to vectorize bit-precision reductions. */
6764 : 462 : if (!type_has_mode_precision_p (type))
6765 : : return false;
6766 : :
6767 : 462 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6768 : :
6769 : : /* TODO: Support lane-reducing operation that does not directly participate
6770 : : in loop reduction. */
6771 : 462 : if (!reduc_info)
6772 : : return false;
6773 : :
6774 : : /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
6775 : : recognized. */
6776 : 462 : gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
6777 : 462 : gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);
6778 : :
6779 : 1848 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
6780 : : {
6781 : 1386 : slp_tree slp_op;
6782 : 1386 : tree op;
6783 : 1386 : tree vectype;
6784 : 1386 : enum vect_def_type dt;
6785 : :
6786 : 1386 : if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
6787 : : &slp_op, &dt, &vectype))
6788 : : {
6789 : 0 : if (dump_enabled_p ())
6790 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6791 : : "use not simple.\n");
6792 : 0 : return false;
6793 : : }
6794 : :
6795 : 1386 : if (!vectype)
6796 : : {
6797 : 6 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
6798 : : slp_op);
6799 : 6 : if (!vectype)
6800 : : return false;
6801 : : }
6802 : :
6803 : 1386 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
6804 : : {
6805 : 0 : if (dump_enabled_p ())
6806 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6807 : : "incompatible vector types for invariants\n");
6808 : 0 : return false;
6809 : : }
6810 : :
6811 : 1386 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6812 : 462 : continue;
6813 : :
6814 : : /* There should be at most one cycle def in the stmt. */
6815 : 924 : if (VECTORIZABLE_CYCLE_DEF (dt))
6816 : : return false;
6817 : : }
6818 : :
6819 : 462 : slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
6820 : 462 : tree vectype_in = SLP_TREE_VECTYPE (node_in);
6821 : 462 : gcc_assert (vectype_in);
6822 : :
6823 : : /* Compute number of effective vector statements for costing. */
6824 : 462 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
6825 : 462 : gcc_assert (ncopies_for_cost >= 1);
6826 : :
6827 : 462 : if (vect_is_emulated_mixed_dot_prod (slp_node))
6828 : : {
6829 : : /* We need extra two invariants: one that contains the minimum signed
6830 : : value and one that contains half of its negative. */
6831 : 11 : int prologue_stmts = 2;
6832 : 11 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
6833 : : scalar_to_vec, slp_node, 0,
6834 : : vect_prologue);
6835 : 11 : if (dump_enabled_p ())
6836 : 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
6837 : : "extra prologue_cost = %d .\n", cost);
6838 : :
6839 : : /* Three dot-products and a subtraction. */
6840 : 11 : ncopies_for_cost *= 4;
6841 : : }
6842 : :
6843 : 462 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
6844 : : 0, vect_body);
6845 : :
6846 : 462 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
6847 : : {
6848 : 43 : enum tree_code code = gimple_assign_rhs_code (stmt);
6849 : 43 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
6850 : 43 : node_in, code, type,
6851 : : vectype_in);
6852 : : }
6853 : :
6854 : : /* Transform via vect_transform_reduction. */
6855 : 462 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6856 : 462 : return true;
6857 : : }
6858 : :
6859 : : /* Function vectorizable_reduction.
6860 : :
6861 : : Check if STMT_INFO performs a reduction operation that can be vectorized.
6862 : : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6863 : : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6864 : : Return true if STMT_INFO is vectorizable in this way.
6865 : :
6866 : : This function also handles reduction idioms (patterns) that have been
6867 : : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6868 : : may be of this form:
6869 : : X = pattern_expr (arg0, arg1, ..., X)
6870 : : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6871 : : sequence that had been detected and replaced by the pattern-stmt
6872 : : (STMT_INFO).
6873 : :
6874 : : This function also handles reduction of condition expressions, for example:
6875 : : for (int i = 0; i < N; i++)
6876 : : if (a[i] < value)
6877 : : last = a[i];
6878 : : This is handled by vectorising the loop and creating an additional vector
6879 : : containing the loop indexes for which "a[i] < value" was true. In the
6880 : : function epilogue this is reduced to a single max value and then used to
6881 : : index into the vector of results.
6882 : :
6883 : : In some cases of reduction patterns, the type of the reduction variable X is
6884 : : different than the type of the other arguments of STMT_INFO.
6885 : : In such cases, the vectype that is used when transforming STMT_INFO into
6886 : : a vector stmt is different than the vectype that is used to determine the
6887 : : vectorization factor, because it consists of a different number of elements
6888 : : than the actual number of elements that are being operated upon in parallel.
6889 : :
6890 : : For example, consider an accumulation of shorts into an int accumulator.
6891 : : On some targets it's possible to vectorize this pattern operating on 8
6892 : : shorts at a time (hence, the vectype for purposes of determining the
6893 : : vectorization factor should be V8HI); on the other hand, the vectype that
6894 : : is used to create the vector form is actually V4SI (the type of the result).
6895 : :
6896 : : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6897 : : indicates what is the actual level of parallelism (V8HI in the example), so
6898 : : that the right vectorization factor would be derived. This vectype
6899 : : corresponds to the type of arguments to the reduction stmt, and should *NOT*
6900 : : be used to create the vectorized stmt. The right vectype for the vectorized
6901 : : stmt is obtained from the type of the result X:
6902 : : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6903 : :
6904 : : This means that, contrary to "regular" reductions (or "regular" stmts in
6905 : : general), the following equation:
6906 : : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6907 : : does *NOT* necessarily hold for reduction patterns. */
6908 : :
6909 : : bool
6910 : 306811 : vectorizable_reduction (loop_vec_info loop_vinfo,
6911 : : stmt_vec_info stmt_info, slp_tree slp_node,
6912 : : slp_instance slp_node_instance,
6913 : : stmt_vector_for_cost *cost_vec)
6914 : : {
6915 : 306811 : tree vectype_in = NULL_TREE;
6916 : 306811 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6917 : 306811 : stmt_vec_info cond_stmt_vinfo = NULL;
6918 : 306811 : int i;
6919 : 306811 : int ncopies;
6920 : 306811 : bool single_defuse_cycle = false;
6921 : 306811 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6922 : 306811 : tree cond_reduc_val = NULL_TREE;
6923 : :
6924 : : /* Make sure it was already recognized as a reduction computation. */
6925 : 306811 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6926 : : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6927 : 306811 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6928 : : return false;
6929 : :
6930 : : /* The reduction meta. */
6931 : 57002 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6932 : :
6933 : 57002 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6934 : : {
6935 : 1339 : gcc_assert (is_a <gphi *> (stmt_info->stmt));
6936 : : /* We eventually need to set a vector type on invariant arguments. */
6937 : : unsigned j;
6938 : : slp_tree child;
6939 : 4017 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6940 : 2678 : if (!vect_maybe_update_slp_op_vectype (child,
6941 : : SLP_TREE_VECTYPE (slp_node)))
6942 : : {
6943 : 0 : if (dump_enabled_p ())
6944 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6945 : : "incompatible vector types for "
6946 : : "invariants\n");
6947 : 0 : return false;
6948 : : }
6949 : : /* Analysis for double-reduction is done on the outer
6950 : : loop PHI, nested cycles have no further restrictions. */
6951 : 1339 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
6952 : 1339 : return true;
6953 : : }
6954 : :
6955 : 55663 : if (!is_a <gphi *> (stmt_info->stmt))
6956 : : {
6957 : 7014 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
6958 : 7014 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6959 : 7014 : return true;
6960 : : }
6961 : :
6962 : 48649 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6963 : 48649 : stmt_vec_info phi_info = stmt_info;
6964 : 48649 : bool double_reduc = false;
6965 : 48649 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6966 : : {
6967 : : /* We arrive here for both the inner loop LC PHI and the
6968 : : outer loop PHI. The latter is what we want to analyze the
6969 : : reduction with. The LC PHI is handled by vectorizable_lc_phi. */
6970 : 266 : if (gimple_bb (stmt_info->stmt) != loop->header)
6971 : 0 : return false;
6972 : :
6973 : : /* Set loop and phi_info to the inner loop. */
6974 : 266 : use_operand_p use_p;
6975 : 266 : gimple *use_stmt;
6976 : 266 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6977 : : &use_p, &use_stmt);
6978 : 266 : gcc_assert (res);
6979 : 266 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
6980 : 266 : loop = loop->inner;
6981 : 266 : double_reduc = true;
6982 : : }
6983 : :
6984 : 48649 : const bool reduc_chain = reduc_info->is_reduc_chain;
6985 : 48649 : slp_node_instance->reduc_phis = slp_node;
6986 : : /* ??? We're leaving slp_node to point to the PHIs, we only
6987 : : need it to get at the number of vector stmts which wasn't
6988 : : yet initialized for the instance root. */
6989 : :
6990 : : /* PHIs should not participate in patterns. */
6991 : 48649 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6992 : 48649 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6993 : :
6994 : : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6995 : : and compute the reduction chain length. Discover the real
6996 : : reduction operation stmt on the way (slp_for_stmt_info). */
6997 : 48649 : unsigned reduc_chain_length = 0;
6998 : 48649 : stmt_info = NULL;
6999 : 48649 : slp_tree slp_for_stmt_info = NULL;
7000 : 48649 : slp_tree vdef_slp = slp_node_instance->root;
7001 : 107068 : while (vdef_slp != slp_node)
7002 : : {
7003 : 59171 : int reduc_idx = SLP_TREE_REDUC_IDX (vdef_slp);
7004 : 59171 : if (reduc_idx == -1)
7005 : : {
7006 : 744 : if (dump_enabled_p ())
7007 : 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7008 : : "reduction chain broken by patterns.\n");
7009 : 752 : return false;
7010 : : }
7011 : 58427 : stmt_vec_info vdef = SLP_TREE_REPRESENTATIVE (vdef_slp);
7012 : 58427 : if (is_a <gphi *> (vdef->stmt))
7013 : : {
7014 : 532 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7015 : : /* Do not count PHIs towards the chain length. */
7016 : 532 : continue;
7017 : : }
7018 : 57895 : gimple_match_op op;
7019 : 57895 : if (!gimple_extract_op (vdef->stmt, &op))
7020 : : {
7021 : 0 : if (dump_enabled_p ())
7022 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7023 : : "reduction chain includes unsupported"
7024 : : " statement type.\n");
7025 : 0 : return false;
7026 : : }
7027 : 57895 : if (CONVERT_EXPR_CODE_P (op.code))
7028 : : {
7029 : 3288 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7030 : : {
7031 : 8 : if (dump_enabled_p ())
7032 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7033 : : "conversion in the reduction chain.\n");
7034 : 8 : return false;
7035 : : }
7036 : 3280 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
7037 : : }
7038 : : else
7039 : : {
7040 : : /* First non-conversion stmt. */
7041 : 54607 : if (!slp_for_stmt_info)
7042 : 47897 : slp_for_stmt_info = vdef_slp;
7043 : :
7044 : 54607 : if (lane_reducing_op_p (op.code))
7045 : : {
7046 : : /* The last operand of lane-reducing operation is for
7047 : : reduction. */
7048 : 462 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7049 : :
7050 : 462 : slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
7051 : 462 : tree vectype_op = SLP_TREE_VECTYPE (op_node);
7052 : 462 : tree type_op = TREE_TYPE (op.ops[0]);
7053 : 462 : if (!vectype_op)
7054 : : {
7055 : 9 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7056 : : type_op);
7057 : 9 : if (!vectype_op
7058 : 9 : || !vect_maybe_update_slp_op_vectype (op_node,
7059 : : vectype_op))
7060 : 0 : return false;
7061 : : }
7062 : :
7063 : : /* To accommodate lane-reducing operations of mixed input
7064 : : vectypes, choose input vectype with the least lanes for the
7065 : : reduction PHI statement, which would result in the most
7066 : : ncopies for vectorized reduction results. */
7067 : 462 : if (!vectype_in
7068 : 462 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7069 : 46 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7070 : 439 : vectype_in = vectype_op;
7071 : : }
7072 : 54145 : else if (!vectype_in)
7073 : 47458 : vectype_in = SLP_TREE_VECTYPE (slp_node);
7074 : 54607 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7075 : : }
7076 : 57887 : reduc_chain_length++;
7077 : : }
7078 : 47897 : stmt_info = SLP_TREE_REPRESENTATIVE (slp_for_stmt_info);
7079 : :
7080 : : /* PHIs should not participate in patterns. */
7081 : 47897 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7082 : :
7083 : : /* 1. Is vectorizable reduction? */
7084 : : /* Not supportable if the reduction variable is used in the loop, unless
7085 : : it's a reduction chain. */
7086 : 47897 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7087 : 0 : && !reduc_chain)
7088 : : return false;
7089 : :
7090 : : /* Reductions that are not used even in an enclosing outer-loop,
7091 : : are expected to be "live" (used out of the loop). */
7092 : 47897 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7093 : 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7094 : : return false;
7095 : :
7096 : : /* 2. Has this been recognized as a reduction pattern?
7097 : :
7098 : : Check if STMT represents a pattern that has been recognized
7099 : : in earlier analysis stages. For stmts that represent a pattern,
7100 : : the STMT_VINFO_RELATED_STMT field records the last stmt in
7101 : : the original sequence that constitutes the pattern. */
7102 : :
7103 : 47897 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7104 : 47897 : if (orig_stmt_info)
7105 : : {
7106 : 3253 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7107 : 3253 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7108 : : }
7109 : :
7110 : : /* 3. Check the operands of the operation. The first operands are defined
7111 : : inside the loop body. The last operand is the reduction variable,
7112 : : which is defined by the loop-header-phi. */
7113 : :
7114 : 47897 : tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
7115 : 47897 : VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
7116 : :
7117 : 47897 : gimple_match_op op;
7118 : 47897 : if (!gimple_extract_op (stmt_info->stmt, &op))
7119 : 0 : gcc_unreachable ();
7120 : 47897 : bool lane_reducing = lane_reducing_op_p (op.code);
7121 : :
7122 : 47897 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7123 : 15140 : && !SCALAR_FLOAT_TYPE_P (op.type))
7124 : : return false;
7125 : :
7126 : : /* Do not try to vectorize bit-precision reductions. */
7127 : 47897 : if (!type_has_mode_precision_p (op.type)
7128 : 1477 : && op.code != BIT_AND_EXPR
7129 : 1411 : && op.code != BIT_IOR_EXPR
7130 : 48332 : && op.code != BIT_XOR_EXPR)
7131 : : return false;
7132 : :
7133 : : /* Lane-reducing ops also never can be used in a SLP reduction group
7134 : : since we'll mix lanes belonging to different reductions. But it's
7135 : : OK to use them in a reduction chain or when the reduction group
7136 : : has just one element. */
7137 : 47587 : if (lane_reducing
7138 : 47587 : && !reduc_chain
7139 : 412 : && SLP_TREE_LANES (slp_node) > 1)
7140 : : {
7141 : 0 : if (dump_enabled_p ())
7142 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7143 : : "lane-reducing reduction in reduction group.\n");
7144 : 0 : return false;
7145 : : }
7146 : :
7147 : : /* All uses but the last are expected to be defined in the loop.
7148 : : The last use is the reduction variable. In case of nested cycle this
7149 : : assumption is not true: we use reduc_index to record the index of the
7150 : : reduction variable. */
7151 : 47587 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7152 : 47587 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7153 : 47587 : gcc_assert (op.code != COND_EXPR || !COMPARISON_CLASS_P (op.ops[0]));
7154 : 151211 : for (i = 0; i < (int) op.num_ops; i++)
7155 : : {
7156 : : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7157 : 103624 : if (i == 0 && op.code == COND_EXPR)
7158 : 51986 : continue;
7159 : :
7160 : 102829 : stmt_vec_info def_stmt_info;
7161 : 102829 : enum vect_def_type dt;
7162 : 102829 : if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
7163 : : i, &op.ops[i], &slp_op[i], &dt,
7164 : 102829 : &vectype_op[i], &def_stmt_info))
7165 : : {
7166 : 0 : if (dump_enabled_p ())
7167 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7168 : : "use not simple.\n");
7169 : 0 : return false;
7170 : : }
7171 : :
7172 : : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7173 : : reduction operand twice (once as definition, once as else). */
7174 : 102829 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]
7175 : 205658 : == SLP_TREE_CHILDREN
7176 : 102829 : (slp_for_stmt_info)[SLP_TREE_REDUC_IDX (slp_for_stmt_info)])
7177 : 51191 : continue;
7178 : :
7179 : : /* There should be only one cycle def in the stmt, the one
7180 : : leading to reduc_def. */
7181 : 51638 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]->cycle_info.id != -1)
7182 : : return false;
7183 : :
7184 : 51638 : if (!vectype_op[i])
7185 : 4427 : vectype_op[i]
7186 : 4427 : = get_vectype_for_scalar_type (loop_vinfo,
7187 : 4427 : TREE_TYPE (op.ops[i]), slp_op[i]);
7188 : :
7189 : : /* Record how the non-reduction-def value of COND_EXPR is defined.
7190 : : ??? For a chain of multiple CONDs we'd have to match them up all. */
7191 : 51638 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7192 : : {
7193 : 772 : if (dt == vect_constant_def)
7194 : : {
7195 : 95 : cond_reduc_dt = dt;
7196 : 95 : cond_reduc_val = op.ops[i];
7197 : : }
7198 : 677 : else if (dt == vect_induction_def
7199 : 411 : && def_stmt_info
7200 : 1088 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7201 : : {
7202 : 112 : cond_reduc_dt = dt;
7203 : 112 : cond_stmt_vinfo = def_stmt_info;
7204 : : }
7205 : : }
7206 : : }
7207 : :
7208 : 47587 : enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7209 : : /* If we have a condition reduction, see if we can simplify it further. */
7210 : 47587 : if (reduction_type == COND_REDUCTION)
7211 : : {
7212 : 783 : if (SLP_TREE_LANES (slp_node) != 1)
7213 : : return false;
7214 : :
7215 : : /* When the condition uses the reduction value in the condition, fail. */
7216 : 759 : if (SLP_TREE_REDUC_IDX (slp_node) == 0)
7217 : : {
7218 : 0 : if (dump_enabled_p ())
7219 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7220 : : "condition depends on previous iteration\n");
7221 : 0 : return false;
7222 : : }
7223 : :
7224 : 759 : if (reduc_chain_length == 1
7225 : 759 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7226 : : OPTIMIZE_FOR_SPEED)
7227 : 736 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7228 : : vectype_in,
7229 : : OPTIMIZE_FOR_SPEED)))
7230 : : {
7231 : 0 : if (dump_enabled_p ())
7232 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7233 : : "optimizing condition reduction with"
7234 : : " FOLD_EXTRACT_LAST.\n");
7235 : 0 : VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7236 : : }
7237 : 759 : else if (cond_reduc_dt == vect_induction_def)
7238 : : {
7239 : 112 : tree base
7240 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7241 : 112 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7242 : :
7243 : 112 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7244 : : && TREE_CODE (step) == INTEGER_CST);
7245 : 112 : cond_reduc_val = NULL_TREE;
7246 : 112 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7247 : 112 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7248 : 112 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7249 : : ;
7250 : : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7251 : : above base; punt if base is the minimum value of the type for
7252 : : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7253 : 100 : else if (tree_int_cst_sgn (step) == -1)
7254 : : {
7255 : 20 : cond_reduc_op_code = MIN_EXPR;
7256 : 20 : if (tree_int_cst_sgn (base) == -1)
7257 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7258 : 20 : else if (tree_int_cst_lt (base,
7259 : 20 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7260 : 20 : cond_reduc_val
7261 : 20 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7262 : : }
7263 : : else
7264 : : {
7265 : 80 : cond_reduc_op_code = MAX_EXPR;
7266 : 80 : if (tree_int_cst_sgn (base) == 1)
7267 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7268 : 80 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7269 : : base))
7270 : 80 : cond_reduc_val
7271 : 80 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7272 : : }
7273 : 100 : if (cond_reduc_val)
7274 : : {
7275 : 100 : if (dump_enabled_p ())
7276 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
7277 : : "condition expression based on "
7278 : : "integer induction.\n");
7279 : 100 : VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
7280 : 100 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
7281 : 100 : = cond_reduc_val;
7282 : 100 : VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7283 : : }
7284 : : }
7285 : 647 : else if (cond_reduc_dt == vect_constant_def)
7286 : : {
7287 : 85 : enum vect_def_type cond_initial_dt;
7288 : 85 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7289 : 85 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7290 : 85 : if (cond_initial_dt == vect_constant_def
7291 : 107 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7292 : 22 : TREE_TYPE (cond_reduc_val)))
7293 : : {
7294 : 22 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7295 : : cond_initial_val, cond_reduc_val);
7296 : 22 : if (e && (integer_onep (e) || integer_zerop (e)))
7297 : : {
7298 : 22 : if (dump_enabled_p ())
7299 : 16 : dump_printf_loc (MSG_NOTE, vect_location,
7300 : : "condition expression based on "
7301 : : "compile time constant.\n");
7302 : : /* Record reduction code at analysis stage. */
7303 : 22 : VECT_REDUC_INFO_CODE (reduc_info)
7304 : 22 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7305 : 22 : VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
7306 : : }
7307 : : }
7308 : : }
7309 : : }
7310 : :
7311 : 47563 : if (STMT_VINFO_LIVE_P (phi_info))
7312 : : return false;
7313 : :
7314 : 47563 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
7315 : :
7316 : 47563 : gcc_assert (ncopies >= 1);
7317 : :
7318 : 47563 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7319 : :
7320 : : /* 4.2. Check support for the epilog operation.
7321 : :
7322 : : If STMT represents a reduction pattern, then the type of the
7323 : : reduction variable may be different than the type of the rest
7324 : : of the arguments. For example, consider the case of accumulation
7325 : : of shorts into an int accumulator; The original code:
7326 : : S1: int_a = (int) short_a;
7327 : : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7328 : :
7329 : : was replaced with:
7330 : : STMT: int_acc = widen_sum <short_a, int_acc>
7331 : :
7332 : : This means that:
7333 : : 1. The tree-code that is used to create the vector operation in the
7334 : : epilog code (that reduces the partial results) is not the
7335 : : tree-code of STMT, but is rather the tree-code of the original
7336 : : stmt from the pattern that STMT is replacing. I.e, in the example
7337 : : above we want to use 'widen_sum' in the loop, but 'plus' in the
7338 : : epilog.
7339 : : 2. The type (mode) we use to check available target support
7340 : : for the vector operation to be created in the *epilog*, is
7341 : : determined by the type of the reduction variable (in the example
7342 : : above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7343 : : However the type (mode) we use to check available target support
7344 : : for the vector operation to be created *inside the loop*, is
7345 : : determined by the type of the other arguments to STMT (in the
7346 : : example we'd check this: optab_handler (widen_sum_optab,
7347 : : vect_short_mode)).
7348 : :
7349 : : This is contrary to "regular" reductions, in which the types of all
7350 : : the arguments are the same as the type of the reduction variable.
7351 : : For "regular" reductions we can therefore use the same vector type
7352 : : (and also the same tree-code) when generating the epilog code and
7353 : : when generating the code inside the loop. */
7354 : :
7355 : 47563 : code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
7356 : :
7357 : : /* If conversion might have created a conditional operation like
7358 : : IFN_COND_ADD already. Use the internal code for the following checks. */
7359 : 47563 : if (orig_code.is_internal_fn ())
7360 : : {
7361 : 3660 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7362 : 3660 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7363 : : }
7364 : :
7365 : 47563 : VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
7366 : :
7367 : 47563 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7368 : 47563 : if (reduction_type == TREE_CODE_REDUCTION)
7369 : : {
7370 : : /* Check whether it's ok to change the order of the computation.
7371 : : Generally, when vectorizing a reduction we change the order of the
7372 : : computation. This may change the behavior of the program in some
7373 : : cases, so we need to check that this is ok. One exception is when
7374 : : vectorizing an outer-loop: the inner-loop is executed sequentially,
7375 : : and therefore vectorizing reductions in the inner-loop during
7376 : : outer-loop vectorization is safe. Likewise when we are vectorizing
7377 : : a series of reductions using SLP and the VF is one the reductions
7378 : : are performed in scalar order. */
7379 : 46804 : if (!reduc_chain
7380 : 46804 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7381 : : ;
7382 : 46662 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7383 : : {
7384 : : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7385 : : is not directy used in stmt. */
7386 : 4793 : if (reduc_chain_length != 1)
7387 : : {
7388 : 67 : if (dump_enabled_p ())
7389 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7390 : : "in-order reduction chain without SLP.\n");
7391 : 67 : return false;
7392 : : }
7393 : : /* Code generation doesn't support function calls other
7394 : : than .COND_*. */
7395 : 4726 : if (!op.code.is_tree_code ()
7396 : 4840 : && !(op.code.is_internal_fn ()
7397 : 57 : && conditional_internal_fn_code (internal_fn (op.code))
7398 : : != ERROR_MARK))
7399 : : {
7400 : 10 : if (dump_enabled_p ())
7401 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7402 : : "in-order reduction chain operation not "
7403 : : "supported.\n");
7404 : 10 : return false;
7405 : : }
7406 : 4716 : VECT_REDUC_INFO_TYPE (reduc_info)
7407 : 4716 : = reduction_type = FOLD_LEFT_REDUCTION;
7408 : : }
7409 : 41869 : else if (!commutative_binary_op_p (orig_code, op.type)
7410 : 41869 : || !associative_binary_op_p (orig_code, op.type))
7411 : : {
7412 : 152 : if (dump_enabled_p ())
7413 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7414 : : "reduction: not commutative/associative\n");
7415 : 152 : return false;
7416 : : }
7417 : : }
7418 : :
7419 : 4716 : if ((reduction_type == COND_REDUCTION
7420 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7421 : : || reduction_type == CONST_COND_REDUCTION
7422 : 42618 : || reduction_type == EXTRACT_LAST_REDUCTION)
7423 : : && 1
7424 : 759 : && ncopies > 1)
7425 : : {
7426 : 276 : if (dump_enabled_p ())
7427 : 60 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7428 : : "multiple types in condition reduction.\n");
7429 : 276 : return false;
7430 : : }
7431 : :
7432 : : /* See if we can convert a mask vector to a corresponding bool data vector
7433 : : to perform the epilogue reduction. */
7434 : 47058 : tree alt_vectype_out = NULL_TREE;
7435 : 47058 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
7436 : : {
7437 : 944 : alt_vectype_out
7438 : 1888 : = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
7439 : 944 : TREE_TYPE (vectype_out),
7440 : : TYPE_VECTOR_SUBPARTS
7441 : : (vectype_out));
7442 : 944 : if (!alt_vectype_out
7443 : 944 : || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
7444 : 1873 : TYPE_VECTOR_SUBPARTS (vectype_out))
7445 : 1888 : || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
7446 : 15 : alt_vectype_out = NULL_TREE;
7447 : : }
7448 : :
7449 : 47058 : internal_fn reduc_fn = IFN_LAST;
7450 : 47058 : if (reduction_type == TREE_CODE_REDUCTION
7451 : 47058 : || reduction_type == FOLD_LEFT_REDUCTION
7452 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7453 : 483 : || reduction_type == CONST_COND_REDUCTION)
7454 : : {
7455 : 41973 : if (reduction_type == FOLD_LEFT_REDUCTION
7456 : 50615 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7457 : 41973 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7458 : : {
7459 : 46013 : internal_fn sbool_fn = IFN_LAST;
7460 : 46013 : if (reduc_fn == IFN_LAST)
7461 : : ;
7462 : 44191 : else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
7463 : 944 : || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
7464 : : == MODE_VECTOR_BOOL))
7465 : 87438 : && direct_internal_fn_supported_p (reduc_fn, vectype_out,
7466 : : OPTIMIZE_FOR_SPEED))
7467 : : ;
7468 : 10108 : else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
7469 : 944 : && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
7470 : 11052 : && direct_internal_fn_supported_p (sbool_fn, vectype_out,
7471 : : OPTIMIZE_FOR_SPEED))
7472 : 65 : reduc_fn = sbool_fn;
7473 : 10043 : else if (reduction_type != FOLD_LEFT_REDUCTION
7474 : 10043 : && alt_vectype_out
7475 : 10043 : && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
7476 : : OPTIMIZE_FOR_SPEED))
7477 : 714 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7478 : : else
7479 : : {
7480 : 9329 : if (dump_enabled_p ())
7481 : 800 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7482 : : "reduc op not supported by target.\n");
7483 : :
7484 : 9329 : reduc_fn = IFN_LAST;
7485 : : }
7486 : : }
7487 : : else
7488 : : {
7489 : 676 : if (dump_enabled_p ())
7490 : 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7491 : : "no reduc code for scalar code.\n");
7492 : :
7493 : 676 : return false;
7494 : : }
7495 : 46013 : if (reduc_fn == IFN_LAST
7496 : 46013 : && VECTOR_BOOLEAN_TYPE_P (vectype_out))
7497 : : {
7498 : 165 : if (!alt_vectype_out)
7499 : : {
7500 : 8 : if (dump_enabled_p ())
7501 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7502 : : "cannot turn mask into bool data vector for "
7503 : : "reduction epilogue.\n");
7504 : 8 : return false;
7505 : : }
7506 : 157 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7507 : : }
7508 : : }
7509 : 369 : else if (reduction_type == COND_REDUCTION)
7510 : : {
7511 : 369 : int scalar_precision
7512 : 369 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7513 : 369 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
7514 : 369 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7515 : : vectype_out);
7516 : :
7517 : 369 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7518 : : OPTIMIZE_FOR_SPEED))
7519 : 8 : reduc_fn = IFN_REDUC_MAX;
7520 : : }
7521 : 46374 : VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
7522 : :
7523 : 46374 : if (reduction_type != EXTRACT_LAST_REDUCTION
7524 : : && reduc_fn == IFN_LAST
7525 : : && !nunits_out.is_constant ())
7526 : : {
7527 : : if (dump_enabled_p ())
7528 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7529 : : "missing target support for reduction on"
7530 : : " variable-length vectors.\n");
7531 : : return false;
7532 : : }
7533 : :
7534 : : /* For SLP reductions, see if there is a neutral value we can use. */
7535 : 46374 : tree neutral_op = NULL_TREE;
7536 : 46374 : tree initial_value = NULL_TREE;
7537 : 46374 : if (reduc_chain)
7538 : 1366 : initial_value = vect_phi_initial_value (reduc_def_phi);
7539 : 46374 : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7540 : : orig_code, initial_value);
7541 : :
7542 : 46374 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7543 : : {
7544 : : /* We can't support in-order reductions of code such as this:
7545 : :
7546 : : for (int i = 0; i < n1; ++i)
7547 : : for (int j = 0; j < n2; ++j)
7548 : : l += a[j];
7549 : :
7550 : : since GCC effectively transforms the loop when vectorizing:
7551 : :
7552 : : for (int i = 0; i < n1 / VF; ++i)
7553 : : for (int j = 0; j < n2; ++j)
7554 : : for (int k = 0; k < VF; ++k)
7555 : : l += a[j];
7556 : :
7557 : : which is a reassociation of the original operation. */
7558 : 56 : if (dump_enabled_p ())
7559 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7560 : : "in-order double reduction not supported.\n");
7561 : :
7562 : 56 : return false;
7563 : : }
7564 : :
7565 : 46318 : if (reduction_type == FOLD_LEFT_REDUCTION
7566 : 3984 : && SLP_TREE_LANES (slp_node) > 1
7567 : 119 : && !reduc_chain)
7568 : : {
7569 : : /* We cannot use in-order reductions in this case because there is
7570 : : an implicit reassociation of the operations involved. */
7571 : 57 : if (dump_enabled_p ())
7572 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7573 : : "in-order unchained SLP reductions not supported.\n");
7574 : 57 : return false;
7575 : : }
7576 : :
7577 : : /* For double reductions, and for SLP reductions with a neutral value,
7578 : : we construct a variable-length initial vector by loading a vector
7579 : : full of the neutral value and then shift-and-inserting the start
7580 : : values into the low-numbered elements. */
7581 : 46261 : if ((double_reduc || neutral_op)
7582 : : && !nunits_out.is_constant ()
7583 : : && (SLP_TREE_LANES (slp_node) != 1 && !reduc_chain)
7584 : : && (!neutral_op
7585 : : || !operand_equal_p (neutral_op,
7586 : : vect_phi_initial_value (reduc_def_phi)))
7587 : : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7588 : : vectype_out, OPTIMIZE_FOR_SPEED))
7589 : : {
7590 : : if (dump_enabled_p ())
7591 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7592 : : "reduction on variable-length vectors requires"
7593 : : " target support for a vector-shift-and-insert"
7594 : : " operation.\n");
7595 : : return false;
7596 : : }
7597 : :
7598 : : /* Check extra constraints for variable-length unchained SLP reductions. */
7599 : 46261 : if (!reduc_chain
7600 : : && !nunits_out.is_constant ())
7601 : : {
7602 : : /* We checked above that we could build the initial vector when
7603 : : there's a neutral element value. Check here for the case in
7604 : : which each SLP statement has its own initial value and in which
7605 : : that value needs to be repeated for every instance of the
7606 : : statement within the initial vector. */
7607 : : unsigned int group_size = SLP_TREE_LANES (slp_node);
7608 : : if (!neutral_op
7609 : : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7610 : : TREE_TYPE (vectype_out)))
7611 : : {
7612 : : if (dump_enabled_p ())
7613 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7614 : : "unsupported form of SLP reduction for"
7615 : : " variable-length vectors: cannot build"
7616 : : " initial vector.\n");
7617 : : return false;
7618 : : }
7619 : : /* The epilogue code relies on the number of elements being a multiple
7620 : : of the group size. The duplicate-and-interleave approach to setting
7621 : : up the initial vector does too. */
7622 : : if (!multiple_p (nunits_out, group_size))
7623 : : {
7624 : : if (dump_enabled_p ())
7625 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7626 : : "unsupported form of SLP reduction for"
7627 : : " variable-length vectors: the vector size"
7628 : : " is not a multiple of the number of results.\n");
7629 : : return false;
7630 : : }
7631 : : }
7632 : :
7633 : 46261 : if (reduction_type == COND_REDUCTION)
7634 : : {
7635 : 369 : widest_int ni;
7636 : :
7637 : 369 : if (! max_loop_iterations (loop, &ni))
7638 : : {
7639 : 0 : if (dump_enabled_p ())
7640 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
7641 : : "loop count not known, cannot create cond "
7642 : : "reduction.\n");
7643 : 0 : return false;
7644 : : }
7645 : : /* Convert backedges to iterations. */
7646 : 369 : ni += 1;
7647 : :
7648 : : /* The additional index will be the same type as the condition. Check
7649 : : that the loop can fit into this less one (because we'll use up the
7650 : : zero slot for when there are no matches). */
7651 : 369 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7652 : 369 : if (wi::geu_p (ni, wi::to_widest (max_index)))
7653 : : {
7654 : 90 : if (dump_enabled_p ())
7655 : 54 : dump_printf_loc (MSG_NOTE, vect_location,
7656 : : "loop size is greater than data size.\n");
7657 : 90 : return false;
7658 : : }
7659 : 369 : }
7660 : :
7661 : : /* In case the vectorization factor (VF) is bigger than the number
7662 : : of elements that we can fit in a vectype (nunits), we have to generate
7663 : : more than one vector stmt - i.e - we need to "unroll" the
7664 : : vector stmt by a factor VF/nunits. For more details see documentation
7665 : : in vectorizable_operation. */
7666 : :
7667 : : /* If the reduction is used in an outer loop we need to generate
7668 : : VF intermediate results, like so (e.g. for ncopies=2):
7669 : : r0 = phi (init, r0)
7670 : : r1 = phi (init, r1)
7671 : : r0 = x0 + r0;
7672 : : r1 = x1 + r1;
7673 : : (i.e. we generate VF results in 2 registers).
7674 : : In this case we have a separate def-use cycle for each copy, and therefore
7675 : : for each copy we get the vector def for the reduction variable from the
7676 : : respective phi node created for this copy.
7677 : :
7678 : : Otherwise (the reduction is unused in the loop nest), we can combine
7679 : : together intermediate results, like so (e.g. for ncopies=2):
7680 : : r = phi (init, r)
7681 : : r = x0 + r;
7682 : : r = x1 + r;
7683 : : (i.e. we generate VF/2 results in a single register).
7684 : : In this case for each copy we get the vector def for the reduction variable
7685 : : from the vectorized reduction operation generated in the previous iteration.
7686 : :
7687 : : This only works when we see both the reduction PHI and its only consumer
7688 : : in vectorizable_reduction and there are no intermediate stmts
7689 : : participating. When unrolling we want each unrolled iteration to have its
7690 : : own reduction accumulator since one of the main goals of unrolling a
7691 : : reduction is to reduce the aggregate loop-carried latency. */
7692 : 46171 : if (ncopies > 1
7693 : 46171 : && !reduc_chain
7694 : 5387 : && SLP_TREE_LANES (slp_node) == 1
7695 : 5237 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7696 : 5218 : && reduc_chain_length == 1
7697 : 4917 : && loop_vinfo->suggested_unroll_factor == 1)
7698 : 46171 : single_defuse_cycle = true;
7699 : :
7700 : 46171 : if (single_defuse_cycle && !lane_reducing)
7701 : : {
7702 : 4342 : gcc_assert (op.code != COND_EXPR);
7703 : :
7704 : : /* 4. check support for the operation in the loop
7705 : :
7706 : : This isn't necessary for the lane reduction codes, since they
7707 : : can only be produced by pattern matching, and it's up to the
7708 : : pattern matcher to test for support. The main reason for
7709 : : specifically skipping this step is to avoid rechecking whether
7710 : : mixed-sign dot-products can be implemented using signed
7711 : : dot-products. */
7712 : 4342 : machine_mode vec_mode = TYPE_MODE (vectype_in);
7713 : 4342 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
7714 : : {
7715 : 701 : if (dump_enabled_p ())
7716 : 10 : dump_printf (MSG_NOTE, "op not supported by target.\n");
7717 : 1402 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7718 : 701 : || !vect_can_vectorize_without_simd_p (op.code))
7719 : : single_defuse_cycle = false;
7720 : : else
7721 : 5 : if (dump_enabled_p ())
7722 : 0 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7723 : : }
7724 : :
7725 : 4342 : if (vect_emulated_vector_p (vectype_in)
7726 : 4342 : && !vect_can_vectorize_without_simd_p (op.code))
7727 : : {
7728 : 0 : if (dump_enabled_p ())
7729 : 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
7730 : 0 : return false;
7731 : : }
7732 : : }
7733 : 46171 : if (dump_enabled_p () && single_defuse_cycle)
7734 : 636 : dump_printf_loc (MSG_NOTE, vect_location,
7735 : : "using single def-use cycle for reduction by reducing "
7736 : : "multiple vectors to one in the loop body\n");
7737 : 46171 : VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7738 : :
7739 : : /* For lane-reducing operation, the below processing related to single
7740 : : defuse-cycle will be done in its own vectorizable function. One more
7741 : : thing to note is that the operation must not be involved in fold-left
7742 : : reduction. */
7743 : 46171 : single_defuse_cycle &= !lane_reducing;
7744 : :
7745 : 46171 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
7746 : 24416 : for (i = 0; i < (int) op.num_ops; i++)
7747 : 16900 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7748 : : {
7749 : 0 : if (dump_enabled_p ())
7750 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7751 : : "incompatible vector types for invariants\n");
7752 : 0 : return false;
7753 : : }
7754 : :
7755 : 46171 : vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
7756 : : reduction_type, ncopies, cost_vec);
7757 : : /* Cost the reduction op inside the loop if transformed via
7758 : : vect_transform_reduction for non-lane-reducing operation. Otherwise
7759 : : this is costed by the separate vectorizable_* routines. */
7760 : 46171 : if (single_defuse_cycle)
7761 : 3646 : record_stmt_cost (cost_vec, ncopies, vector_stmt,
7762 : : slp_for_stmt_info, 0, vect_body);
7763 : :
7764 : 46171 : if (dump_enabled_p ()
7765 : 46171 : && reduction_type == FOLD_LEFT_REDUCTION)
7766 : 212 : dump_printf_loc (MSG_NOTE, vect_location,
7767 : : "using an in-order (fold-left) reduction.\n");
7768 : 46171 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7769 : :
7770 : : /* All but single defuse-cycle optimized and fold-left reductions go
7771 : : through their own vectorizable_* routines. */
7772 : 46171 : stmt_vec_info tem
7773 : 46171 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (slp_node_instance));
7774 : 46171 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
7775 : 38655 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7776 : : else
7777 : : {
7778 : 7516 : STMT_VINFO_DEF_TYPE (tem) = vect_reduction_def;
7779 : 7516 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7780 : 3349 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7781 : : slp_node, op.code, op.type,
7782 : : vectype_in);
7783 : : }
7784 : : return true;
7785 : : }
7786 : :
/* STMT_INFO is a dot-product reduction whose multiplication operands
   have different signs.  Emit a sequence to emulate the operation
   using a series of signed DOT_PROD_EXPRs and return the last
   statement generated.  VEC_DEST is the result of the vector operation
   and VOP lists its inputs.

   NOTE: VOP is modified in place below -- its elements may be swapped
   and replaced by sign-converted SSA names.  */

static gassign *
vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
			     gimple_stmt_iterator *gsi, tree vec_dest,
			     tree vop[3])
{
  tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
  tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
  tree narrow_elttype = TREE_TYPE (narrow_vectype);
  gimple *new_stmt;

  /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
  if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
    std::swap (vop[0], vop[1]);

  /* Convert all inputs to signed types.  */
  for (int i = 0; i < 3; ++i)
    if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
      {
	tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
	new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
	vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
	vop[i] = tmp;
      }

  /* In the comments below we assume 8-bit inputs for simplicity,
     but the approach works for any full integer type.  */

  /* Create a vector of -128.  */
  tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
  tree min_narrow = build_vector_from_val (narrow_vectype,
					   min_narrow_elttype);

  /* Create a vector of 64: a logical shift right by one of the
     minimum value (0x80 for 8 bits) gives 0x40.  */
  auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
  tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
  half_narrow = build_vector_from_val (narrow_vectype, half_narrow);

  /* Emit: SUB_RES = VOP[0] - 128.  Adding the most negative value is
     the wrapping equivalent of subtracting 128.  */
  tree sub_res = make_ssa_name (narrow_vectype);
  new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  /* Emit:

       STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
       STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
       STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;

     on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
     Doing the two 64 * y steps first allows more time to compute x.  */
  tree stage1 = make_ssa_name (wide_vectype);
  new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
				  vop[1], half_narrow, vop[2]);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  tree stage2 = make_ssa_name (wide_vectype);
  new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
				  vop[1], half_narrow, stage1);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  tree stage3 = make_ssa_name (wide_vectype);
  new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
				  sub_res, vop[1], stage2);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  /* Convert STAGE3 to the reduction type.  */
  return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
}
7861 : :
/* Transform the definition stmt STMT_INFO of a reduction PHI backedge
   value.  SLP_NODE is the SLP node for the reduction statement; GSI
   points at the place where vector statements are emitted.  Returns
   true on success (fold-left reductions are handed off to
   vectorize_fold_left_reduction, which provides the return value in
   that case).  */

bool
vect_transform_reduction (loop_vec_info loop_vinfo,
			  stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
			  slp_tree slp_node)
{
  tree vectype_out = SLP_TREE_VECTYPE (slp_node);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned vec_num;

  vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);

  /* For a statement nested in an inner loop, transform relative to the
     inner loop; only double reductions are expected here.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
		  == vect_double_reduction_def);
    }

  gimple_match_op op;
  if (!gimple_extract_op (stmt_info->stmt, &op))
    gcc_unreachable ();

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
  tree vectype_in = SLP_TREE_VECTYPE (slp_node);
  /* Lane-reducing ops (dot-prod, widen-sum, sad) consume narrower
     vectors than they produce; take the input vectype from the first
     child instead of from the node itself.  */
  if (lane_reducing_op_p (op.code))
    vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);

  vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);

  /* Pick the conditional internal function used for masking: either the
     operation already is a masked internal fn, or look up the IFN_COND_*
     counterpart of the plain operation.  */
  code_helper code = canonicalize_code (op.code, op.type);
  internal_fn cond_fn
    = ((code.is_internal_fn ()
	&& internal_fn_mask_index ((internal_fn)code) != -1)
       ? (internal_fn)code : get_conditional_internal_fn (code, op.type));

  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);

  /* Transform.  */
  tree new_temp = NULL_TREE;
  /* One vector-def list per operand slot; at most ternary ops here.  */
  auto_vec<tree> vec_oprnds[3];

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");

  /* A binary COND_OP reduction must have the same definition and else
     value. */
  bool cond_fn_p = code.is_internal_fn ()
    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
  if (cond_fn_p)
    {
      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
		  || code == IFN_COND_MUL || code == IFN_COND_AND
		  || code == IFN_COND_IOR || code == IFN_COND_XOR
		  || code == IFN_COND_MIN || code == IFN_COND_MAX);
      gcc_assert (op.num_ops == 4
		  && (op.ops[reduc_index]
		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
    }

  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
  /* In-order (fold-left) reductions have a dedicated transform.  */
  if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
      gcc_assert (code.is_tree_code () || cond_fn_p);
      return vectorize_fold_left_reduction
	  (loop_vinfo, stmt_info, gsi, slp_node,
	   code, reduc_fn, op.num_ops, vectype_in,
	   reduc_index, masks, lens);
    }

  bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
  bool lane_reducing = lane_reducing_op_p (code);
  /* Everything else is handled by separate vectorizable_* routines, so
     only these two shapes may reach this point.  */
  gcc_assert (single_defuse_cycle || lane_reducing);

  if (lane_reducing)
    {
      /* The last operand of lane-reducing op is for reduction.  */
      gcc_assert (reduc_index == (int) op.num_ops - 1);
    }

  /* Create the destination vector  */
  tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  /* Get NCOPIES vector definitions for all operands except the reduction
     definition (a NULL_TREE argument tells vect_get_vec_defs to skip that
     slot).  */
  if (!cond_fn_p)
    {
      gcc_assert (reduc_index >= 0 && reduc_index <= 2);
      vect_get_vec_defs (loop_vinfo, slp_node,
			 single_defuse_cycle && reduc_index == 0
			 ? NULL_TREE : op.ops[0], &vec_oprnds[0],
			 single_defuse_cycle && reduc_index == 1
			 ? NULL_TREE : op.ops[1], &vec_oprnds[1],
			 op.num_ops == 3
			 && !(single_defuse_cycle && reduc_index == 2)
			 ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
    }
  else
    {
      /* For a conditional operation pass the truth type as mask
	 vectype.  */
      gcc_assert (single_defuse_cycle
		  && (reduc_index == 1 || reduc_index == 2));
      vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
			 &vec_oprnds[0],
			 reduc_index == 1 ? NULL_TREE : op.ops[1],
			 &vec_oprnds[1],
			 reduc_index == 2 ? NULL_TREE : op.ops[2],
			 &vec_oprnds[2]);
    }

  /* For single def-use cycles get one copy of the vectorized reduction
     definition (this second call fills in only the slot skipped
     above).  */
  if (single_defuse_cycle)
    {
      vect_get_vec_defs (loop_vinfo, slp_node,
			 reduc_index == 0 ? op.ops[0] : NULL_TREE,
			 &vec_oprnds[0],
			 reduc_index == 1 ? op.ops[1] : NULL_TREE,
			 &vec_oprnds[1],
			 reduc_index == 2 ? op.ops[2] : NULL_TREE,
			 &vec_oprnds[2]);
    }
  else if (lane_reducing)
    {
      /* For normal reduction, consistency between vectorized def/use is
	 naturally ensured when mapping from scalar statement.  But if lane-
	 reducing op is involved in reduction, thing would become somewhat
	 complicated in that the op's result and operand for accumulation are
	 limited to less lanes than other operands, which certainly causes
	 def/use mismatch on adjacent statements around the op if do not have
	 any kind of specific adjustment.  One approach is to refit lane-
	 reducing op in the way of introducing new trivial pass-through copies
	 to fix possible def/use gap, so as to make it behave like a normal op.
	 And vector reduction PHIs are always generated to the full extent, no
	 matter lane-reducing op exists or not.  If some copies or PHIs are
	 actually superfluous, they would be cleaned up by passes after
	 vectorization.  An example for single-lane slp, lane-reducing ops
	 with mixed input vectypes in a reduction chain, is given as below.
	 Similarly, this handling is applicable for multiple-lane slp as well.

	   int sum = 1;
	   for (i)
	     {
	       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
	       sum += w[i];               // widen-sum <vector(16) char>
	       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
	       sum += n[i];               // normal <vector(4) int>
	     }

	 The vector size is 128-bit, vectorization factor is 16.  Reduction
	 statements would be transformed as:

	   vector<4> int sum_v0 = { 0, 0, 0, 1 };
	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
	   vector<4> int sum_v3 = { 0, 0, 0, 0 };

	   for (i / 16)
	     {
	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
	       sum_v1 = sum_v1;  // copy
	       sum_v2 = sum_v2;  // copy
	       sum_v3 = sum_v3;  // copy

	       sum_v0 = sum_v0;  // copy
	       sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
	       sum_v2 = sum_v2;  // copy
	       sum_v3 = sum_v3;  // copy

	       sum_v0 = sum_v0;  // copy
	       sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
	       sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
	       sum_v3 = sum_v3;  // copy

	       sum_v0 += n_v0[i: 0 ~ 3 ];
	       sum_v1 += n_v1[i: 4 ~ 7 ];
	       sum_v2 += n_v2[i: 8 ~ 11];
	       sum_v3 += n_v3[i: 12 ~ 15];
	     }

	 Moreover, for a higher instruction parallelism in final vectorized
	 loop, it is considered to make those effective vector lane-reducing
	 ops be distributed evenly among all def-use cycles.  In the above
	 example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
	 cycles, instruction dependency among them could be eliminated.  */
      unsigned effec_ncopies = vec_oprnds[0].length ();
      unsigned total_ncopies = vec_oprnds[reduc_index].length ();

      gcc_assert (effec_ncopies <= total_ncopies);

      if (effec_ncopies < total_ncopies)
	{
	  /* Pad the non-reduction operand lists with NULL_TREEs; the
	     statement loop below turns those slots into pass-through
	     copies.  */
	  for (unsigned i = 0; i < op.num_ops - 1; i++)
	    {
	      gcc_assert (vec_oprnds[i].length () == effec_ncopies);
	      vec_oprnds[i].safe_grow_cleared (total_ncopies);
	    }
	}

      tree reduc_vectype_in = vectype_in;
      gcc_assert (reduc_vectype_in);

      unsigned effec_reduc_ncopies
	= vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);

      gcc_assert (effec_ncopies <= effec_reduc_ncopies);

      if (effec_ncopies < effec_reduc_ncopies)
	{
	  /* Find suitable def-use cycles to generate vectorized statements
	     into, and reorder operands based on the selection.  */
	  unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
	  unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;

	  gcc_assert (curr_pos < effec_reduc_ncopies);
	  /* Rotate the starting position so successive lane-reducing ops
	     land in different def-use cycles.  */
	  VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;

	  if (curr_pos)
	    {
	      unsigned count = effec_reduc_ncopies - effec_ncopies;
	      unsigned start = curr_pos - count;

	      if ((int) start < 0)
		{
		  count = curr_pos;
		  start = 0;
		}

	      /* Shift the effective operands into the slots selected for
		 this op's cycles; vacated slots must be empty (NULL).  */
	      for (unsigned i = 0; i < op.num_ops - 1; i++)
		{
		  for (unsigned j = effec_ncopies; j > start; j--)
		    {
		      unsigned k = j - 1;
		      std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
		      gcc_assert (!vec_oprnds[i][k]);
		    }
		}
	    }
	}
    }

  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
  /* Number of statements to generate: taken from a non-reduction operand
     list, which holds the effective copies.  */
  unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
  unsigned mask_index = 0;

  for (unsigned i = 0; i < num; ++i)
    {
      gimple *new_stmt;
      tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
      if (!vop[0] || !vop[1])
	{
	  tree reduc_vop = vec_oprnds[reduc_index][i];

	  /* If could not generate an effective vector statement for current
	     portion of reduction operand, insert a trivial copy to simply
	     hand over the operand to other dependent statements.  */
	  gcc_assert (reduc_vop);

	  if (TREE_CODE (reduc_vop) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
	    new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
	  else
	    {
	      new_temp = make_ssa_name (vec_dest);
	      new_stmt = gimple_build_assign (new_temp, reduc_vop);
	      vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
					   gsi);
	    }
	}
      else if (masked_loop_p && !mask_by_cond_expr)
	{
	  /* No conditional ifns have been defined for lane-reducing op
	     yet.  */
	  gcc_assert (!lane_reducing);

	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					  vec_num, vectype_in,
					  mask_index++);
	  gcall *call;
	  if (code.is_internal_fn () && cond_fn_p)
	    {
	      /* Already a conditional ifn: combine the loop mask with the
		 operation's own mask (operand 0).  */
	      gcc_assert (op.num_ops >= 3
			  && internal_fn_mask_index (internal_fn (code)) == 0);
	      vop[2] = vec_oprnds[2][i];
	      mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
				       mask, vop[0], gsi);
	      call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
						 vop[2], vop[reduc_index]);
	    }
	  else
	    {
	      /* Plain operation: wrap it in COND_FN with the reduction
		 input as the else value, so inactive lanes pass through.  */
	      gcc_assert (code.is_tree_code ());
	      call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
						 vop[1], vop[reduc_index]);
	    }
	  new_temp = make_ssa_name (vec_dest, call);
	  gimple_call_set_lhs (call, new_temp);
	  gimple_call_set_nothrow (call, true);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
	  new_stmt = call;
	}
      else
	{
	  if (op.num_ops >= 3)
	    vop[2] = vec_oprnds[2][i];

	  if (masked_loop_p && mask_by_cond_expr)
	    {
	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					      vec_num, vectype_in,
					      mask_index++);
	      build_vect_cond_expr (code, vop, mask, gsi);
	    }

	  if (emulated_mixed_dot_prod)
	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
						    vec_dest, vop);

	  else if (code.is_internal_fn () && !cond_fn_p)
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2]);
	  else if (code.is_internal_fn () && cond_fn_p)
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2],
						   vop[reduc_index]);
	  else
	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
					    vop[0], vop[1], vop[2]);
	  new_temp = make_ssa_name (vec_dest, new_stmt);
	  gimple_set_lhs (new_stmt, new_temp);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
	}

      /* In a single def-use cycle each statement feeds the next copy's
	 reduction operand; only the final result becomes the node's
	 vector def.  */
      if (single_defuse_cycle && i < num - 1)
	vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
      else
	slp_node->push_vec_def (new_stmt);
    }

  return true;
}
8218 : :
8219 : : /* Transform phase of a cycle PHI. */
8220 : :
8221 : : bool
8222 : 23360 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8223 : : stmt_vec_info stmt_info,
8224 : : slp_tree slp_node, slp_instance slp_node_instance)
8225 : : {
8226 : 23360 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
8227 : 23360 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8228 : 23360 : int i;
8229 : 23360 : bool nested_cycle = false;
8230 : 23360 : int vec_num;
8231 : :
8232 : 23476 : if (nested_in_vect_loop_p (loop, stmt_info))
8233 : : {
8234 : : loop = loop->inner;
8235 : : nested_cycle = true;
8236 : : }
8237 : :
8238 : 23360 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
8239 : 23360 : if (reduc_info
8240 : 22771 : && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8241 : 22771 : || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
8242 : : /* Leave the scalar phi in place. */
8243 : : return true;
8244 : :
8245 : 21941 : if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ())
8246 : 109 : dump_printf_loc (MSG_NOTE, vect_location,
8247 : : "vectorizing a reduction chain\n");
8248 : :
8249 : 22530 : vec_num = vect_get_num_copies (loop_vinfo, slp_node);
8250 : :
8251 : : /* Check whether we should use a single PHI node and accumulate
8252 : : vectors to one before the backedge. */
8253 : 22530 : if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
8254 : 22530 : vec_num = 1;
8255 : :
8256 : : /* Create the destination vector */
8257 : 22530 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8258 : 22530 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8259 : : vectype_out);
8260 : :
8261 : : /* Get the loop-entry arguments. */
8262 : 22530 : tree vec_initial_def = NULL_TREE;
8263 : 22530 : auto_vec<tree> vec_initial_defs;
8264 : 22530 : vec_initial_defs.reserve (vec_num);
8265 : : /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8266 : : and we can't use zero for induc_val, use initial_def. Similarly
8267 : : for REDUC_MIN and initial_def larger than the base. */
8268 : 22530 : if (reduc_info
8269 : 21941 : && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8270 : : {
8271 : 66 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8272 : 66 : tree initial_def = vect_phi_initial_value (phi);
8273 : 66 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
8274 : 66 : tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
8275 : 66 : if (TREE_CODE (initial_def) == INTEGER_CST
8276 : 64 : && !integer_zerop (induc_val)
8277 : 130 : && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
8278 : 44 : && tree_int_cst_lt (initial_def, induc_val))
8279 : 61 : || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
8280 : 20 : && tree_int_cst_lt (induc_val, initial_def))))
8281 : : {
8282 : 3 : induc_val = initial_def;
8283 : : /* Communicate we used the initial_def to epilouge
8284 : : generation. */
8285 : 3 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8286 : : }
8287 : 66 : vec_initial_defs.quick_push
8288 : 66 : (build_vector_from_val (vectype_out, induc_val));
8289 : 66 : }
8290 : 22464 : else if (nested_cycle)
8291 : : {
8292 : 670 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8293 : 670 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8294 : : &vec_initial_defs);
8295 : : }
8296 : : else
8297 : : {
8298 : 21794 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8299 : 21794 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
8300 : 21794 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8301 : :
8302 : 21794 : unsigned int num_phis = stmts.length ();
8303 : 21794 : if (reduc_info->is_reduc_chain)
8304 : 179 : num_phis = 1;
8305 : 21794 : initial_values.reserve (num_phis);
8306 : 44044 : for (unsigned int i = 0; i < num_phis; ++i)
8307 : : {
8308 : 22250 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8309 : 22250 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8310 : : }
8311 : 21794 : if (vec_num == 1)
8312 : 21216 : vect_find_reusable_accumulator (loop_vinfo, reduc_info, vectype_out);
8313 : 21794 : if (!initial_values.is_empty ())
8314 : : {
8315 : 21579 : tree initial_value
8316 : 42934 : = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8317 : 21579 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
8318 : 21579 : tree neutral_op
8319 : 21579 : = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8320 : : code, initial_value);
8321 : : /* Try to simplify the vector initialization by applying an
8322 : : adjustment after the reduction has been performed. This
8323 : : can also break a critical path but on the other hand
8324 : : requires to keep the initial value live across the loop. */
8325 : 21579 : if (neutral_op
8326 : 21492 : && initial_values.length () == 1
8327 : 21288 : && !VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
8328 : 17349 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8329 : 38850 : && !operand_equal_p (neutral_op, initial_values[0]))
8330 : : {
8331 : 12143 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
8332 : 12143 : = initial_values[0];
8333 : 12143 : initial_values[0] = neutral_op;
8334 : : }
8335 : 43158 : get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
8336 : : &vec_initial_defs, vec_num,
8337 : : stmts.length (), neutral_op);
8338 : : }
8339 : : }
8340 : :
8341 : 22530 : if (vec_initial_def)
8342 : : {
8343 : 0 : vec_initial_defs.create (1);
8344 : 0 : vec_initial_defs.quick_push (vec_initial_def);
8345 : : }
8346 : :
8347 : 22530 : if (reduc_info)
8348 : 21941 : if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
8349 : : {
8350 : 4171 : tree def = accumulator->reduc_input;
8351 : 4171 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8352 : : {
8353 : 4168 : unsigned int nreduc;
8354 : 8336 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8355 : 4168 : (TREE_TYPE (def)),
8356 : 4168 : TYPE_VECTOR_SUBPARTS (vectype_out),
8357 : : &nreduc);
8358 : 0 : gcc_assert (res);
8359 : 4168 : gimple_seq stmts = NULL;
8360 : : /* Reduce the single vector to a smaller one. */
8361 : 4168 : if (nreduc != 1)
8362 : : {
8363 : : /* Perform the reduction in the appropriate type. */
8364 : 4168 : tree rvectype = vectype_out;
8365 : 4168 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8366 : 4168 : TREE_TYPE (TREE_TYPE (def))))
8367 : 235 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8368 : : TYPE_VECTOR_SUBPARTS
8369 : 470 : (vectype_out));
8370 : 4168 : def = vect_create_partial_epilog (def, rvectype,
8371 : : VECT_REDUC_INFO_CODE
8372 : : (reduc_info),
8373 : : &stmts);
8374 : : }
8375 : : /* The epilogue loop might use a different vector mode, like
8376 : : VNx2DI vs. V2DI. */
8377 : 4168 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8378 : : {
8379 : 0 : tree reduc_type = build_vector_type_for_mode
8380 : 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8381 : 0 : def = gimple_convert (&stmts, reduc_type, def);
8382 : : }
8383 : : /* Adjust the input so we pick up the partially reduced value
8384 : : for the skip edge in vect_create_epilog_for_reduction. */
8385 : 4168 : accumulator->reduc_input = def;
8386 : : /* And the reduction could be carried out using a different sign. */
8387 : 4168 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8388 : 235 : def = gimple_convert (&stmts, vectype_out, def);
8389 : 4168 : edge e;
8390 : 4168 : if ((e = loop_vinfo->main_loop_edge)
8391 : 4168 : || (e = loop_vinfo->skip_this_loop_edge))
8392 : : {
8393 : : /* While we'd like to insert on the edge this will split
8394 : : blocks and disturb bookkeeping, we also will eventually
8395 : : need this on the skip edge. Rely on sinking to
8396 : : fixup optimal placement and insert in the pred. */
8397 : 3953 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
8398 : : /* Insert before a cond that eventually skips the
8399 : : epilogue. */
8400 : 3953 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8401 : 3936 : gsi_prev (&gsi);
8402 : 3953 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8403 : : }
8404 : : else
8405 : 215 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8406 : : stmts);
8407 : : }
8408 : 4171 : if (loop_vinfo->main_loop_edge)
8409 : 3956 : vec_initial_defs[0]
8410 : 3956 : = vect_get_main_loop_result (loop_vinfo, def,
8411 : 3956 : vec_initial_defs[0]);
8412 : : else
8413 : 215 : vec_initial_defs.safe_push (def);
8414 : : }
8415 : :
8416 : : /* Generate the reduction PHIs upfront. */
8417 : 46819 : for (i = 0; i < vec_num; i++)
8418 : : {
8419 : 24289 : tree vec_init_def = vec_initial_defs[i];
8420 : : /* Create the reduction-phi that defines the reduction
8421 : : operand. */
8422 : 24289 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
8423 : 24289 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8424 : : UNKNOWN_LOCATION);
8425 : :
8426 : : /* The loop-latch arg is set in epilogue processing. */
8427 : :
8428 : 24289 : slp_node->push_vec_def (new_phi);
8429 : : }
8430 : :
8431 : 22530 : return true;
8432 : 22530 : }
8433 : :
8434 : : /* Vectorizes LC PHIs. */
8435 : :
8436 : : bool
8437 : 157402 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8438 : : stmt_vec_info stmt_info,
8439 : : slp_tree slp_node)
8440 : : {
8441 : 157402 : if (!loop_vinfo
8442 : 157402 : || !is_a <gphi *> (stmt_info->stmt)
8443 : 188330 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8444 : : return false;
8445 : :
8446 : 704 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8447 : 0 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8448 : : return false;
8449 : :
8450 : : /* Deal with copies from externs or constants that disguise as
8451 : : loop-closed PHI nodes (PR97886). */
8452 : 704 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8453 : : SLP_TREE_VECTYPE (slp_node)))
8454 : : {
8455 : 0 : if (dump_enabled_p ())
8456 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8457 : : "incompatible vector types for invariants\n");
8458 : 0 : return false;
8459 : : }
8460 : :
8461 : : /* ??? This can happen with data vs. mask uses of boolean. */
8462 : 704 : if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
8463 : 704 : SLP_TREE_VECTYPE
8464 : : (SLP_TREE_CHILDREN (slp_node)[0])))
8465 : : {
8466 : 0 : if (dump_enabled_p ())
8467 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8468 : : "missed mask promotion\n");
8469 : 0 : return false;
8470 : : }
8471 : :
8472 : 704 : SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
8473 : 704 : return true;
8474 : : }
8475 : :
8476 : : bool
8477 : 447 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
8478 : : stmt_vec_info stmt_info,
8479 : : slp_tree slp_node)
8480 : : {
8481 : :
8482 : 447 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8483 : 447 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8484 : 447 : basic_block bb = gimple_bb (stmt_info->stmt);
8485 : 447 : edge e = single_pred_edge (bb);
8486 : 447 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8487 : 447 : auto_vec<tree> vec_oprnds;
8488 : 894 : vect_get_vec_defs (loop_vinfo, slp_node,
8489 : 447 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8490 : 1001 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8491 : : {
8492 : : /* Create the vectorized LC PHI node. */
8493 : 554 : gphi *new_phi = create_phi_node (vec_dest, bb);
8494 : 554 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8495 : 554 : slp_node->push_vec_def (new_phi);
8496 : : }
8497 : :
8498 : 447 : return true;
8499 : 447 : }
8500 : :
8501 : : /* Vectorizes PHIs. */
8502 : :
8503 : : bool
8504 : 139066 : vectorizable_phi (bb_vec_info vinfo,
8505 : : stmt_vec_info stmt_info,
8506 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8507 : : {
8508 : 139066 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8509 : : return false;
8510 : :
8511 : 67822 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8512 : : return false;
8513 : :
8514 : 67822 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8515 : :
8516 : 67822 : if (cost_vec) /* transformation not required. */
8517 : : {
8518 : : slp_tree child;
8519 : : unsigned i;
8520 : 184270 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8521 : 129955 : if (!child)
8522 : : {
8523 : 0 : if (dump_enabled_p ())
8524 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8525 : : "PHI node with unvectorized backedge def\n");
8526 : 0 : return false;
8527 : : }
8528 : 129955 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8529 : : {
8530 : 18 : if (dump_enabled_p ())
8531 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8532 : : "incompatible vector types for invariants\n");
8533 : 18 : return false;
8534 : : }
8535 : 129937 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8536 : 129937 : && !useless_type_conversion_p (vectype,
8537 : : SLP_TREE_VECTYPE (child)))
8538 : : {
8539 : : /* With bools we can have mask and non-mask precision vectors
8540 : : or different non-mask precisions. while pattern recog is
8541 : : supposed to guarantee consistency here bugs in it can cause
8542 : : mismatches (PR103489 and PR103800 for example).
8543 : : Deal with them here instead of ICEing later. */
8544 : 18 : if (dump_enabled_p ())
8545 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8546 : : "incompatible vector type setup from "
8547 : : "bool pattern detection\n");
8548 : 18 : return false;
8549 : : }
8550 : :
8551 : : /* For single-argument PHIs assume coalescing which means zero cost
8552 : : for the scalar and the vector PHIs. This avoids artificially
8553 : : favoring the vector path (but may pessimize it in some cases). */
8554 : 54315 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8555 : 49393 : record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
8556 : : vector_stmt, slp_node, vectype, 0, vect_body);
8557 : 54315 : SLP_TREE_TYPE (slp_node) = phi_info_type;
8558 : 54315 : return true;
8559 : : }
8560 : :
8561 : 13471 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8562 : 13471 : basic_block bb = gimple_bb (stmt_info->stmt);
8563 : 13471 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8564 : 13471 : auto_vec<gphi *> new_phis;
8565 : 48580 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8566 : : {
8567 : 35109 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8568 : :
8569 : : /* Skip not yet vectorized defs. */
8570 : 35483 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8571 : 35109 : && SLP_TREE_VEC_DEFS (child).is_empty ())
8572 : 374 : continue;
8573 : :
8574 : 34735 : auto_vec<tree> vec_oprnds;
8575 : 34735 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8576 : 34735 : if (!new_phis.exists ())
8577 : : {
8578 : 13471 : new_phis.create (vec_oprnds.length ());
8579 : 28479 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8580 : : {
8581 : : /* Create the vectorized LC PHI node. */
8582 : 15008 : new_phis.quick_push (create_phi_node (vec_dest, bb));
8583 : 15008 : slp_node->push_vec_def (new_phis[j]);
8584 : : }
8585 : : }
8586 : 34735 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8587 : 75126 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8588 : 40391 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8589 : 34735 : }
8590 : : /* We should have at least one already vectorized child. */
8591 : 13471 : gcc_assert (new_phis.exists ());
8592 : :
8593 : 13471 : return true;
8594 : 13471 : }
8595 : :
8596 : : /* Vectorizes first order recurrences. An overview of the transformation
8597 : : is described below. Suppose we have the following loop.
8598 : :
8599 : : int t = 0;
8600 : : for (int i = 0; i < n; ++i)
8601 : : {
8602 : : b[i] = a[i] - t;
8603 : : t = a[i];
8604 : : }
8605 : :
8606 : : There is a first-order recurrence on 'a'. For this loop, the scalar IR
8607 : : looks (simplified) like:
8608 : :
8609 : : scalar.preheader:
8610 : : init = 0;
8611 : :
8612 : : scalar.body:
8613 : : i = PHI <0(scalar.preheader), i+1(scalar.body)>
8614 : : _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8615 : : _1 = a[i]
8616 : : b[i] = _1 - _2
8617 : : if (i < n) goto scalar.body
8618 : :
8619 : : In this example, _2 is a recurrence because it's value depends on the
8620 : : previous iteration. We vectorize this as (VF = 4)
8621 : :
8622 : : vector.preheader:
8623 : : vect_init = vect_cst(..., ..., ..., 0)
8624 : :
8625 : : vector.body
8626 : : i = PHI <0(vector.preheader), i+4(vector.body)>
8627 : : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8628 : : vect_2 = a[i, i+1, i+2, i+3];
8629 : : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8630 : : b[i, i+1, i+2, i+3] = vect_2 - vect_3
8631 : : if (..) goto vector.body
8632 : :
8633 : : In this function, vectorizable_recurr, we code generate both the
8634 : : vector PHI node and the permute since those together compute the
8635 : : vectorized value of the scalar PHI. We do not yet have the
8636 : : backedge value to fill in there nor into the vec_perm. Those
8637 : : are filled in vect_schedule_scc.
8638 : :
8639 : : TODO: Since the scalar loop does not have a use of the recurrence
8640 : : outside of the loop the natural way to implement peeling via
8641 : : vectorizing the live value doesn't work. For now peeling of loops
8642 : : with a recurrence is not implemented. For SLP the supported cases
8643 : : are restricted to those requiring a single vector recurrence PHI. */
8644 : :
8645 : : bool
8646 : 156738 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8647 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8648 : : {
8649 : 156738 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8650 : : return false;
8651 : :
8652 : 30264 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
8653 : :
8654 : : /* So far we only support first-order recurrence auto-vectorization. */
8655 : 30264 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8656 : : return false;
8657 : :
8658 : 408 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8659 : 408 : unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
8660 : 408 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8661 : 408 : unsigned dist = SLP_TREE_LANES (slp_node);
8662 : : /* We need to be able to make progress with a single vector. */
8663 : 408 : if (maybe_gt (dist * 2, nunits))
8664 : : {
8665 : 0 : if (dump_enabled_p ())
8666 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8667 : : "first order recurrence exceeds half of "
8668 : : "a vector\n");
8669 : 0 : return false;
8670 : : }
8671 : :
8672 : : /* We need to be able to build a { ..., a, b } init vector with
8673 : : dist number of distinct trailing values. Always possible
8674 : : when dist == 1 or when nunits is constant or when the initializations
8675 : : are uniform. */
8676 : 408 : tree uniform_initval = NULL_TREE;
8677 : 408 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8678 : 1656 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8679 : : {
8680 : 444 : gphi *phi = as_a <gphi *> (s->stmt);
8681 : 444 : if (! uniform_initval)
8682 : 408 : uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8683 : 36 : else if (! operand_equal_p (uniform_initval,
8684 : 36 : PHI_ARG_DEF_FROM_EDGE (phi, pe)))
8685 : : {
8686 : : uniform_initval = NULL_TREE;
8687 : : break;
8688 : : }
8689 : : }
8690 : 408 : if (!uniform_initval && !nunits.is_constant ())
8691 : : {
8692 : : if (dump_enabled_p ())
8693 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8694 : : "cannot build initialization vector for "
8695 : : "first order recurrence\n");
8696 : : return false;
8697 : : }
8698 : :
8699 : : /* First-order recurrence autovectorization needs to handle permutation
8700 : : with indices = [nunits-1, nunits, nunits+1, ...]. */
8701 : 408 : vec_perm_builder sel (nunits, 1, 3);
8702 : 1632 : for (int i = 0; i < 3; ++i)
8703 : 1224 : sel.quick_push (nunits - dist + i);
8704 : 408 : vec_perm_indices indices (sel, 2, nunits);
8705 : :
8706 : 408 : if (cost_vec) /* transformation not required. */
8707 : : {
8708 : 368 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8709 : : indices))
8710 : : return false;
8711 : :
8712 : : /* We eventually need to set a vector type on invariant
8713 : : arguments. */
8714 : : unsigned j;
8715 : : slp_tree child;
8716 : 768 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8717 : 512 : if (!vect_maybe_update_slp_op_vectype (child, vectype))
8718 : : {
8719 : 0 : if (dump_enabled_p ())
8720 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8721 : : "incompatible vector types for "
8722 : : "invariants\n");
8723 : 0 : return false;
8724 : : }
8725 : :
8726 : : /* Verify we have set up compatible types. */
8727 : 256 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8728 : 256 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
8729 : 256 : tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
8730 : 256 : if (!types_compatible_p (latch_vectype, vectype))
8731 : : return false;
8732 : :
8733 : : /* The recurrence costs the initialization vector and one permute
8734 : : for each copy. With SLP the prologue value is explicitly
8735 : : represented and costed separately. */
8736 : 256 : unsigned prologue_cost = 0;
8737 : 256 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8738 : : slp_node, 0, vect_body);
8739 : 256 : if (dump_enabled_p ())
8740 : 48 : dump_printf_loc (MSG_NOTE, vect_location,
8741 : : "vectorizable_recurr: inside_cost = %d, "
8742 : : "prologue_cost = %d .\n", inside_cost,
8743 : : prologue_cost);
8744 : :
8745 : 256 : SLP_TREE_TYPE (slp_node) = recurr_info_type;
8746 : 256 : return true;
8747 : : }
8748 : :
8749 : 40 : tree vec_init;
8750 : 40 : if (! uniform_initval)
8751 : : {
8752 : 6 : vec<constructor_elt, va_gc> *v = NULL;
8753 : 6 : vec_alloc (v, nunits.to_constant ());
8754 : 33 : for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
8755 : 27 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8756 : : build_zero_cst (TREE_TYPE (vectype)));
8757 : 39 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8758 : : {
8759 : 21 : gphi *phi = as_a <gphi *> (s->stmt);
8760 : 21 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8761 : 21 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
8762 : 21 : TREE_TYPE (preheader)))
8763 : : {
8764 : 0 : gimple_seq stmts = NULL;
8765 : 0 : preheader = gimple_convert (&stmts,
8766 : 0 : TREE_TYPE (vectype), preheader);
8767 : 0 : gsi_insert_seq_on_edge_immediate (pe, stmts);
8768 : : }
8769 : 21 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
8770 : : }
8771 : 6 : vec_init = build_constructor (vectype, v);
8772 : : }
8773 : : else
8774 : : vec_init = uniform_initval;
8775 : 40 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8776 : :
8777 : : /* Create the vectorized first-order PHI node. */
8778 : 40 : tree vec_dest = vect_get_new_vect_var (vectype,
8779 : : vect_simple_var, "vec_recur_");
8780 : 40 : basic_block bb = gimple_bb (phi);
8781 : 40 : gphi *new_phi = create_phi_node (vec_dest, bb);
8782 : 40 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8783 : :
8784 : : /* Insert shuffles the first-order recurrence autovectorization.
8785 : : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8786 : 40 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
8787 : :
8788 : : /* Insert the required permute after the latch definition. The
8789 : : second and later operands are tentative and will be updated when we have
8790 : : vectorized the latch definition. */
8791 : 40 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8792 : 40 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8793 : 40 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8794 : 40 : gsi_next (&gsi2);
8795 : :
8796 : 117 : for (unsigned i = 0; i < ncopies; ++i)
8797 : : {
8798 : 77 : vec_dest = make_ssa_name (vectype);
8799 : 77 : gassign *vperm
8800 : 117 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8801 : 40 : i == 0 ? gimple_phi_result (new_phi) : NULL,
8802 : : NULL, perm);
8803 : 77 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8804 : :
8805 : 77 : slp_node->push_vec_def (vperm);
8806 : : }
8807 : :
8808 : : return true;
8809 : 408 : }
8810 : :
8811 : : /* Return true if VECTYPE represents a vector that requires lowering
8812 : : by the vector lowering pass. */
8813 : :
8814 : : bool
8815 : 635951 : vect_emulated_vector_p (tree vectype)
8816 : : {
8817 : 1271902 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8818 : 638650 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8819 : 2681 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8820 : : }
8821 : :
8822 : : /* Return true if we can emulate CODE on an integer mode representation
8823 : : of a vector. */
8824 : :
8825 : : bool
8826 : 10706 : vect_can_vectorize_without_simd_p (tree_code code)
8827 : : {
8828 : 10706 : switch (code)
8829 : : {
8830 : : case PLUS_EXPR:
8831 : : case MINUS_EXPR:
8832 : : case NEGATE_EXPR:
8833 : : case BIT_AND_EXPR:
8834 : : case BIT_IOR_EXPR:
8835 : : case BIT_XOR_EXPR:
8836 : : case BIT_NOT_EXPR:
8837 : : return true;
8838 : :
8839 : 9945 : default:
8840 : 9945 : return false;
8841 : : }
8842 : : }
8843 : :
8844 : : /* Likewise, but taking a code_helper. */
8845 : :
8846 : : bool
8847 : 155 : vect_can_vectorize_without_simd_p (code_helper code)
8848 : : {
8849 : 155 : return (code.is_tree_code ()
8850 : 155 : && vect_can_vectorize_without_simd_p (tree_code (code)));
8851 : : }
8852 : :
8853 : : /* Create vector init for vectorized iv. */
8854 : : static tree
8855 : 916 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8856 : : tree step_expr, poly_uint64 nunits,
8857 : : tree vectype,
8858 : : enum vect_induction_op_type induction_type)
8859 : : {
8860 : 916 : unsigned HOST_WIDE_INT const_nunits;
8861 : 916 : tree vec_shift, vec_init, new_name;
8862 : 916 : unsigned i;
8863 : 916 : tree itype = TREE_TYPE (vectype);
8864 : :
8865 : : /* iv_loop is the loop to be vectorized. Create:
8866 : : vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
8867 : 916 : new_name = gimple_convert (stmts, itype, init_expr);
8868 : 916 : switch (induction_type)
8869 : : {
8870 : 18 : case vect_step_op_shr:
8871 : 18 : case vect_step_op_shl:
8872 : : /* Build the Initial value from shift_expr. */
8873 : 18 : vec_init = gimple_build_vector_from_val (stmts,
8874 : : vectype,
8875 : : new_name);
8876 : 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8877 : : build_zero_cst (itype), step_expr);
8878 : 18 : vec_init = gimple_build (stmts,
8879 : : (induction_type == vect_step_op_shr
8880 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
8881 : : vectype, vec_init, vec_shift);
8882 : 18 : break;
8883 : :
8884 : 822 : case vect_step_op_neg:
8885 : 822 : {
8886 : 822 : vec_init = gimple_build_vector_from_val (stmts,
8887 : : vectype,
8888 : : new_name);
8889 : 822 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8890 : : vectype, vec_init);
8891 : : /* The encoding has 2 interleaved stepped patterns. */
8892 : 822 : vec_perm_builder sel (nunits, 2, 3);
8893 : 822 : sel.quick_grow (6);
8894 : 4110 : for (i = 0; i < 3; i++)
8895 : : {
8896 : 2466 : sel[2 * i] = i;
8897 : 2466 : sel[2 * i + 1] = i + nunits;
8898 : : }
8899 : 822 : vec_perm_indices indices (sel, 2, nunits);
8900 : : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8901 : : fail when vec_init is const vector. In that situation vec_perm is not
8902 : : really needed. */
8903 : 822 : tree perm_mask_even
8904 : 822 : = vect_gen_perm_mask_any (vectype, indices);
8905 : 822 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8906 : : vectype,
8907 : : vec_init, vec_neg,
8908 : : perm_mask_even);
8909 : 822 : }
8910 : 822 : break;
8911 : :
8912 : 76 : case vect_step_op_mul:
8913 : 76 : {
8914 : : /* Use unsigned mult to avoid UD integer overflow. */
8915 : 76 : gcc_assert (nunits.is_constant (&const_nunits));
8916 : 76 : tree utype = unsigned_type_for (itype);
8917 : 76 : tree uvectype = build_vector_type (utype,
8918 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
8919 : 76 : new_name = gimple_convert (stmts, utype, new_name);
8920 : 76 : vec_init = gimple_build_vector_from_val (stmts,
8921 : : uvectype,
8922 : : new_name);
8923 : 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
8924 : 76 : tree elt_step = build_one_cst (utype);
8925 : :
8926 : 76 : elts.quick_push (elt_step);
8927 : 660 : for (i = 1; i < const_nunits; i++)
8928 : : {
8929 : : /* Create: new_name_i = new_name + step_expr. */
8930 : 508 : elt_step = gimple_build (stmts, MULT_EXPR,
8931 : : utype, elt_step, step_expr);
8932 : 508 : elts.quick_push (elt_step);
8933 : : }
8934 : : /* Create a vector from [new_name_0, new_name_1, ...,
8935 : : new_name_nunits-1]. */
8936 : 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
8937 : 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8938 : : vec_init, vec_mul);
8939 : 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
8940 : 76 : }
8941 : 76 : break;
8942 : :
8943 : 0 : default:
8944 : 0 : gcc_unreachable ();
8945 : : }
8946 : :
8947 : 916 : return vec_init;
8948 : : }
8949 : :
8950 : : /* Peel init_expr by skip_niter for induction_type. */
8951 : : tree
8952 : 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8953 : : tree skip_niters, tree step_expr,
8954 : : enum vect_induction_op_type induction_type)
8955 : : {
8956 : 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8957 : 84 : tree type = TREE_TYPE (init_expr);
8958 : 84 : unsigned prec = TYPE_PRECISION (type);
8959 : 84 : switch (induction_type)
8960 : : {
8961 : 0 : case vect_step_op_neg:
8962 : 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
8963 : 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8964 : : /* else no change. */
8965 : : break;
8966 : :
8967 : 12 : case vect_step_op_shr:
8968 : 12 : case vect_step_op_shl:
8969 : 12 : skip_niters = gimple_convert (stmts, type, skip_niters);
8970 : 12 : step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8971 : : /* When shift mount >= precision, need to avoid UD.
8972 : : In the original loop, there's no UD, and according to semantic,
8973 : : init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
8974 : 12 : if (!tree_fits_uhwi_p (step_expr)
8975 : 12 : || tree_to_uhwi (step_expr) >= prec)
8976 : : {
8977 : 6 : if (induction_type == vect_step_op_shl
8978 : 6 : || TYPE_UNSIGNED (type))
8979 : 4 : init_expr = build_zero_cst (type);
8980 : : else
8981 : 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8982 : : init_expr,
8983 : 4 : wide_int_to_tree (type, prec - 1));
8984 : : }
8985 : : else
8986 : 8 : init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8987 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
8988 : : type, init_expr, step_expr);
8989 : : break;
8990 : :
8991 : 72 : case vect_step_op_mul:
8992 : 72 : {
8993 : 72 : tree utype = unsigned_type_for (type);
8994 : 72 : init_expr = gimple_convert (stmts, utype, init_expr);
8995 : 72 : wide_int skipn = wi::to_wide (skip_niters);
8996 : 72 : wide_int begin = wi::to_wide (step_expr);
8997 : 72 : auto_mpz base, exp, mod, res;
8998 : 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
8999 : 72 : wi::to_mpz (skipn, exp, UNSIGNED);
9000 : 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9001 : 72 : mpz_powm (res, base, exp, mod);
9002 : 72 : begin = wi::from_mpz (utype, res, true);
9003 : 72 : tree mult_expr = wide_int_to_tree (utype, begin);
9004 : 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
9005 : : init_expr, mult_expr);
9006 : 72 : init_expr = gimple_convert (stmts, type, init_expr);
9007 : 72 : }
9008 : 72 : break;
9009 : :
9010 : 0 : default:
9011 : 0 : gcc_unreachable ();
9012 : : }
9013 : :
9014 : 84 : return init_expr;
9015 : : }
9016 : :
9017 : : /* Create vector step for vectorized iv. */
9018 : : static tree
9019 : 1202 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9020 : : poly_uint64 vf,
9021 : : enum vect_induction_op_type induction_type)
9022 : : {
9023 : 1202 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9024 : 1202 : tree new_name = NULL;
9025 : : /* Step should be pow (step, vf) for mult induction. */
9026 : 1202 : if (induction_type == vect_step_op_mul)
9027 : : {
9028 : 76 : gcc_assert (vf.is_constant ());
9029 : 76 : wide_int begin = wi::to_wide (step_expr);
9030 : :
9031 : 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9032 : 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9033 : :
9034 : 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9035 : 76 : }
9036 : 1126 : else if (induction_type == vect_step_op_neg)
9037 : : /* Do nothing. */
9038 : : ;
9039 : : else
9040 : 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9041 : : expr, step_expr);
9042 : 1202 : return new_name;
9043 : : }
9044 : :
9045 : : static tree
9046 : 1202 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9047 : : stmt_vec_info stmt_info,
9048 : : tree new_name, tree vectype,
9049 : : enum vect_induction_op_type induction_type)
9050 : : {
9051 : : /* No step is needed for neg induction. */
9052 : 1202 : if (induction_type == vect_step_op_neg)
9053 : : return NULL;
9054 : :
9055 : 94 : tree t = unshare_expr (new_name);
9056 : 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9057 : : || TREE_CODE (new_name) == SSA_NAME);
9058 : 94 : tree new_vec = build_vector_from_val (vectype, t);
9059 : 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9060 : : new_vec, vectype, NULL);
9061 : 94 : return vec_step;
9062 : : }
9063 : :
9064 : : /* Emit into STMTS the per-iteration update of a nonlinear vectorized IV:
 : :    apply VEC_STEP to INDUC_DEF (the current IV value, initially the PHI
 : :    result) according to INDUCTION_TYPE, and return the updated vector.
 : :    For vect_step_op_neg the IV is updated by a permute elsewhere, so
 : :    INDUC_DEF is returned unchanged.  */
9065 : : static tree
9066 : 1390 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9067 : : tree induc_def, tree vec_step,
9068 : : enum vect_induction_op_type induction_type)
9069 : : {
9070 : 1390 : tree vec_def = induc_def;
9071 : 1390 : switch (induction_type)
9072 : : {
9073 : 76 : case vect_step_op_mul:
9074 : 76 : {
9075 : : /* Use unsigned mult to avoid UD (undefined) signed integer
 : : overflow; compute in the unsigned copy of the type and
 : : convert back.  */
9076 : 76 : tree uvectype
9077 : 76 : = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9078 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
9079 : 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9080 : 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9081 : 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9082 : : vec_def, vec_step);
9083 : 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9084 : : }
9085 : 76 : break;
9086 : :
9087 : 12 : case vect_step_op_shr:
9088 : 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9089 : : vec_def, vec_step);
9090 : 12 : break;
9091 : :
9092 : 6 : case vect_step_op_shl:
9093 : 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9094 : : vec_def, vec_step);
9095 : 6 : break;
9096 : : case vect_step_op_neg:
9097 : : vec_def = induc_def;
9098 : : /* Do nothing. */
9099 : : break;
9100 : 0 : default:
9101 : 0 : gcc_unreachable ();
9102 : : }
9103 : :
9104 : 1390 : return vec_def;
9105 : :
9106 : : }
9107 : :
9108 : : /* Function vectorizable_nonlinear_induction
9109 : :
9110 : : Check if STMT_INFO performs a nonlinear induction computation that can be
9111 : : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9112 : : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9113 : : basic block.
9114 : : Return true if STMT_INFO is vectorizable in this way. */
9115 : :
9116 : : static bool
9117 : 10412 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9118 : : stmt_vec_info stmt_info,
9119 : : slp_tree slp_node,
9120 : : stmt_vector_for_cost *cost_vec)
9121 : : {
9122 : 10412 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9123 : 10412 : unsigned ncopies;
9124 : 10412 : bool nested_in_vect_loop = false;
9125 : 10412 : class loop *iv_loop;
9126 : 10412 : tree vec_def;
9127 : 10412 : edge pe = loop_preheader_edge (loop);
9128 : 10412 : basic_block new_bb;
9129 : 10412 : tree vec_init, vec_step;
9130 : 10412 : tree new_name;
9131 : 10412 : gimple *new_stmt;
9132 : 10412 : gphi *induction_phi;
9133 : 10412 : tree induc_def, vec_dest;
9134 : 10412 : tree init_expr, step_expr;
9135 : 10412 : tree niters_skip;
9136 : 10412 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9137 : 10412 : unsigned i;
9138 : 10412 : gimple_stmt_iterator si;
9139 : :
9140 : 10412 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9141 : :
9142 : 10412 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9143 : 10412 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9144 : 10412 : enum vect_induction_op_type induction_type
9145 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9146 : :
 : : /* The caller (vectorizable_induction) routes linear (vect_step_op_add)
 : : inductions elsewhere; only nonlinear kinds reach this function.  */
9147 : 10412 : gcc_assert (induction_type > vect_step_op_add);
9148 : :
9149 : 10412 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
9150 : 10412 : gcc_assert (ncopies >= 1);
9151 : :
9152 : : /* FORNOW. Only handle nonlinear induction in the same loop. */
9153 : 10412 : if (nested_in_vect_loop_p (loop, stmt_info))
9154 : : {
9155 : 0 : if (dump_enabled_p ())
9156 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9157 : : "nonlinear induction in nested loop.\n");
9158 : 0 : return false;
9159 : : }
9160 : :
9161 : 10412 : iv_loop = loop;
9162 : 10412 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9163 : :
9164 : : /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
9165 : : vector iv update for each iv and a permutation to generate wanted
9166 : : vector iv. */
9167 : 10412 : if (SLP_TREE_LANES (slp_node) > 1)
9168 : : {
9169 : 0 : if (dump_enabled_p ())
9170 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9171 : : "SLP induction not supported for nonlinear"
9172 : : " induction.\n");
9173 : 0 : return false;
9174 : : }
9175 : :
9176 : 10412 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9177 : : {
9178 : 0 : if (dump_enabled_p ())
9179 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9180 : : "floating point nonlinear induction vectorization"
9181 : : " not supported.\n");
9182 : 0 : return false;
9183 : : }
9184 : :
9185 : 10412 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9186 : 10412 : init_expr = vect_phi_initial_value (phi);
9187 : 10412 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9188 : : && TREE_CODE (step_expr) == INTEGER_CST);
9189 : : /* step_expr should be aligned with init_expr,
9190 : : i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9191 : 10412 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9192 : :
9193 : 10412 : if (TREE_CODE (init_expr) == INTEGER_CST)
9194 : 2837 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9195 : 7575 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9196 : : {
9197 : : /* INIT_EXPR could be a bit_field, bail out for such case. */
9198 : 4 : if (dump_enabled_p ())
9199 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9200 : : "nonlinear induction vectorization failed:"
9201 : : " component type of vectype is not a nop conversion"
9202 : : " from type of init_expr.\n");
9203 : 4 : return false;
9204 : : }
9205 : :
 : : /* Per-kind feasibility checks: make sure the target supports the
 : : operations the transform below will emit.  */
9206 : 10408 : switch (induction_type)
9207 : : {
9208 : 2538 : case vect_step_op_neg:
9209 : 2538 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9210 : : return false;
9211 : 2534 : if (TREE_CODE (init_expr) != INTEGER_CST
9212 : 190 : && TREE_CODE (init_expr) != REAL_CST)
9213 : : {
9214 : : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9215 : 190 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9216 : 0 : return false;
9217 : :
9218 : : /* The encoding has 2 interleaved stepped patterns. */
9219 : 190 : vec_perm_builder sel (nunits, 2, 3);
9220 : 190 : machine_mode mode = TYPE_MODE (vectype);
9221 : 190 : sel.quick_grow (6);
9222 : 950 : for (i = 0; i < 3; i++)
9223 : : {
9224 : 570 : sel[i * 2] = i;
9225 : 570 : sel[i * 2 + 1] = i + nunits;
9226 : : }
9227 : 190 : vec_perm_indices indices (sel, 2, nunits);
9228 : 190 : if (!can_vec_perm_const_p (mode, mode, indices))
9229 : 0 : return false;
9230 : 190 : }
9231 : : break;
9232 : :
9233 : 744 : case vect_step_op_mul:
9234 : 744 : {
9235 : : /* Check for backend support of MULT_EXPR. */
9236 : 744 : if (!directly_supported_p (MULT_EXPR, vectype))
9237 : : return false;
9238 : :
9239 : : /* ?? How to construct vector step for variable number vector.
9240 : : [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9241 : : if (!vf.is_constant ())
9242 : : return false;
9243 : : }
9244 : : break;
9245 : :
9246 : 7022 : case vect_step_op_shr:
9247 : : /* Check for backend support of RSHIFT_EXPR. */
9248 : 7022 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9249 : : return false;
9250 : :
9251 : : /* Don't shift more than type precision to avoid UD. */
9252 : 26 : if (!tree_fits_uhwi_p (step_expr)
9253 : 26 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9254 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9255 : : return false;
9256 : : break;
9257 : :
9258 : 104 : case vect_step_op_shl:
9259 : : /* Check for backend support of LSHIFT_EXPR. */
9260 : 104 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9261 : : return false;
9262 : :
9263 : : /* Don't shift more than type precision to avoid UD. */
9264 : 12 : if (!tree_fits_uhwi_p (step_expr)
9265 : 12 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9266 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9267 : : return false;
9268 : :
9269 : : break;
9270 : :
9271 : 0 : default:
9272 : 0 : gcc_unreachable ();
9273 : : }
9274 : :
9275 : 3152 : if (cost_vec) /* transformation not required. */
9276 : : {
9277 : 2236 : unsigned inside_cost = 0, prologue_cost = 0;
9278 : : /* loop cost for vec_loop. Neg induction doesn't have any
9279 : : inside_cost. */
9280 : 2236 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9281 : : slp_node, 0, vect_body);
9282 : :
9283 : : /* Neg induction has no in-loop update statement, so its body
9284 : : cost is zero.  */
9285 : 2236 : if (induction_type == vect_step_op_neg)
9286 : 1712 : inside_cost = 0;
9287 : :
9288 : : /* prologue cost for vec_init and vec_step. */
9289 : 2236 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9290 : : slp_node, 0, vect_prologue);
9291 : :
9292 : 2236 : if (dump_enabled_p ())
9293 : 60 : dump_printf_loc (MSG_NOTE, vect_location,
9294 : : "vect_model_induction_cost: inside_cost = %d, "
9295 : : "prologue_cost = %d. \n", inside_cost,
9296 : : prologue_cost);
9297 : :
9298 : 2236 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9299 : 2236 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9300 : 2236 : return true;
9301 : : }
9302 : :
9303 : : /* Transform. */
9304 : :
9305 : : /* Compute a vector variable, initialized with the first VF values of
9306 : : the induction variable. E.g., for an iv with IV_PHI='X' and
9307 : : evolution S, for a vector of 4 units, we want to compute:
9308 : : [X, X + S, X + 2*S, X + 3*S]. */
9309 : :
9310 : 916 : if (dump_enabled_p ())
9311 : 32 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9312 : :
9313 : 916 : pe = loop_preheader_edge (iv_loop);
9314 : : /* Find the first insertion point in the BB. */
9315 : 916 : basic_block bb = gimple_bb (phi);
9316 : 916 : si = gsi_after_labels (bb);
9317 : :
9318 : 916 : gimple_seq stmts = NULL;
9319 : :
9320 : 916 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9321 : : /* If we are using the loop mask to "peel" for alignment then we need
9322 : : to adjust the start value here. */
9323 : 916 : if (niters_skip != NULL_TREE)
9324 : 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9325 : : step_expr, induction_type);
9326 : :
9327 : 916 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9328 : : step_expr, nunits, vectype,
9329 : : induction_type);
9330 : 916 : if (stmts)
9331 : : {
9332 : 162 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9333 : 162 : gcc_assert (!new_bb);
9334 : : }
9335 : :
9336 : 916 : stmts = NULL;
9337 : 916 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9338 : : vf, induction_type);
9339 : 916 : if (stmts)
9340 : : {
9341 : 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9342 : 0 : gcc_assert (!new_bb);
9343 : : }
9344 : :
9345 : 916 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9346 : : new_name, vectype,
9347 : : induction_type);
9348 : : /* Create the following def-use cycle:
9349 : : loop prolog:
9350 : : vec_init = ...
9351 : : vec_step = ...
9352 : : loop:
9353 : : vec_iv = PHI <vec_init, vec_loop>
9354 : : ...
9355 : : STMT
9356 : : ...
9357 : : vec_loop = vec_iv + vec_step; */
9358 : :
9359 : : /* Create the induction-phi that defines the induction-operand. */
9360 : 916 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9361 : 916 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9362 : 916 : induc_def = PHI_RESULT (induction_phi);
9363 : :
9364 : : /* Create the iv update inside the loop. */
9365 : 916 : stmts = NULL;
9366 : 916 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9367 : : induc_def, vec_step,
9368 : : induction_type);
9369 : :
9370 : 916 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9371 : 916 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9372 : :
9373 : : /* Set the arguments of the phi node: */
9374 : 916 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9375 : 916 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9376 : : UNKNOWN_LOCATION);
9377 : :
9378 : 916 : slp_node->push_vec_def (induction_phi);
9379 : :
9380 : : /* In case that vectorization factor (VF) is bigger than the number
9381 : : of elements that we can fit in a vectype (nunits), we have to generate
9382 : : more than one vector stmt - i.e - we need to "unroll" the
9383 : : vector stmt by a factor VF/nunits. For more details see documentation
9384 : : in vectorizable_operation. */
9385 : :
9386 : 916 : if (ncopies > 1)
9387 : : {
9388 : 286 : stmts = NULL;
9389 : : /* FORNOW. This restriction should be relaxed. */
9390 : 286 : gcc_assert (!nested_in_vect_loop);
9391 : :
 : : /* Per-copy step advances by nunits lanes, not the whole VF.  */
9392 : 286 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9393 : : nunits, induction_type);
9394 : :
9395 : 286 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9396 : : new_name, vectype,
9397 : : induction_type);
9398 : 286 : vec_def = induc_def;
9399 : 1046 : for (i = 1; i < ncopies; i++)
9400 : : {
9401 : : /* vec_i = vec_prev + vec_step. */
9402 : 474 : stmts = NULL;
9403 : 474 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9404 : : vec_def, vec_step,
9405 : : induction_type);
9406 : 474 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9407 : 474 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9408 : 474 : slp_node->push_vec_def (new_stmt);
9409 : : }
9410 : : }
9411 : :
9412 : 916 : if (dump_enabled_p ())
9413 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
9414 : : "transform induction: created def-use cycle: %G%G",
9415 : 32 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9416 : :
9417 : : return true;
9418 : : }
9419 : :
9420 : : /* Function vectorizable_induction
9421 : :
9422 : : Check if STMT_INFO performs an induction computation that can be vectorized.
9423 : : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9424 : : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9425 : : Return true if STMT_INFO is vectorizable in this way. */
9426 : :
9427 : : bool
9428 : 270271 : vectorizable_induction (loop_vec_info loop_vinfo,
9429 : : stmt_vec_info stmt_info,
9430 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9431 : : {
9432 : 270271 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9433 : 270271 : bool nested_in_vect_loop = false;
9434 : 270271 : class loop *iv_loop;
9435 : 270271 : tree vec_def;
9436 : 270271 : edge pe = loop_preheader_edge (loop);
9437 : 270271 : basic_block new_bb;
9438 : 270271 : tree vec_init = NULL_TREE, vec_step, t;
9439 : 270271 : tree new_name;
9440 : 270271 : gphi *induction_phi;
9441 : 270271 : tree induc_def, vec_dest;
9442 : 270271 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9443 : 270271 : unsigned i;
9444 : 270271 : tree expr;
9445 : 270271 : tree index_vectype = NULL_TREE;
9446 : 270271 : gimple_stmt_iterator si;
9447 : 270271 : enum vect_induction_op_type induction_type
9448 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9449 : :
9450 : 293939 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9451 : 143797 : if (!phi)
9452 : : return false;
9453 : :
9454 : 143797 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9455 : : return false;
9456 : :
9457 : : /* Make sure it was recognized as induction computation. */
9458 : 143797 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9459 : : return false;
9460 : :
9461 : : /* Handle nonlinear induction in a separate place. */
9462 : 140247 : if (induction_type != vect_step_op_add)
9463 : 10412 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9464 : 10412 : slp_node, cost_vec);
9465 : :
9466 : 129835 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9467 : 129835 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9468 : :
9469 : : /* FORNOW. These restrictions should be relaxed. */
9470 : 129835 : if (nested_in_vect_loop_p (loop, stmt_info))
9471 : : {
9472 : 602 : imm_use_iterator imm_iter;
9473 : 602 : use_operand_p use_p;
9474 : 602 : gimple *exit_phi;
9475 : 602 : edge latch_e;
9476 : 602 : tree loop_arg;
9477 : :
9478 : 602 : exit_phi = NULL;
9479 : 602 : latch_e = loop_latch_edge (loop->inner);
9480 : 602 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9481 : 1848 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9482 : : {
9483 : 654 : gimple *use_stmt = USE_STMT (use_p);
9484 : 654 : if (is_gimple_debug (use_stmt))
9485 : 36 : continue;
9486 : :
9487 : 618 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9488 : : {
9489 : : exit_phi = use_stmt;
9490 : : break;
9491 : : }
9492 : 602 : }
9493 : 602 : if (exit_phi)
9494 : : {
9495 : 10 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9496 : 10 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9497 : 6 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9498 : : {
9499 : 4 : if (dump_enabled_p ())
9500 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9501 : : "inner-loop induction only used outside "
9502 : : "of the outer vectorized loop.\n");
9503 : 4 : return false;
9504 : : }
9505 : : }
9506 : :
9507 : 598 : nested_in_vect_loop = true;
9508 : 598 : iv_loop = loop->inner;
9509 : : }
9510 : : else
9511 : : iv_loop = loop;
9512 : 129831 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9513 : :
9514 : 129831 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
9515 : : {
9516 : : /* The current SLP code creates the step value element-by-element. */
9517 : : if (dump_enabled_p ())
9518 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9519 : : "SLP induction not supported for variable-length"
9520 : : " vectors.\n");
9521 : : return false;
9522 : : }
9523 : :
9524 : 129831 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9525 : : {
9526 : 12 : if (dump_enabled_p ())
9527 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9528 : : "floating point induction vectorization disabled\n");
9529 : 12 : return false;
9530 : : }
9531 : :
9532 : 129819 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9533 : 129819 : gcc_assert (step_expr != NULL_TREE);
9534 : 259592 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
9535 : 259500 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
9536 : : {
9537 : 12 : if (dump_enabled_p ())
9538 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9539 : : "bit-precision induction vectorization not "
9540 : : "supported.\n");
9541 : 12 : return false;
9542 : : }
9543 : 129807 : tree stept = TREE_TYPE (step_expr);
9544 : 129807 : tree step_vectype = get_same_sized_vectype (stept, vectype);
9545 : 129807 : stept = TREE_TYPE (step_vectype);
9546 : :
9547 : : /* Check for target support of the vectorized arithmetic used here. */
9548 : 129807 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
9549 : 129807 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
9550 : 20090 : return false;
9551 : 109717 : if (!nunits.is_constant ())
9552 : : {
9553 : : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
9554 : : return false;
9555 : : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
9556 : : if (SCALAR_FLOAT_TYPE_P (stept))
9557 : : {
9558 : : tree index_type = build_nonstandard_integer_type
9559 : : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
9560 : :
9561 : : index_vectype = build_vector_type (index_type, nunits);
9562 : : if (!can_float_p (TYPE_MODE (step_vectype),
9563 : : TYPE_MODE (index_vectype), 1))
9564 : : return false;
9565 : : }
9566 : : }
9567 : :
9568 : 109717 : unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
9569 : 109717 : if (cost_vec) /* transformation not required. */
9570 : : {
9571 : 277947 : unsigned inside_cost = 0, prologue_cost = 0;
9572 : : /* We eventually need to set a vector type on invariant
9573 : : arguments. */
9574 : : unsigned j;
9575 : : slp_tree child;
9576 : 277947 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9577 : 185298 : if (!vect_maybe_update_slp_op_vectype
9578 : 185298 : (child, SLP_TREE_VECTYPE (slp_node)))
9579 : : {
9580 : 0 : if (dump_enabled_p ())
9581 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9582 : : "incompatible vector types for "
9583 : : "invariants\n");
9584 : 0 : return false;
9585 : : }
9586 : : /* loop cost for vec_loop. */
9587 : 92649 : inside_cost = record_stmt_cost (cost_vec, nvects,
9588 : : vector_stmt, slp_node, 0, vect_body);
9589 : : /* prologue cost for vec_init (if not nested) and step. */
9590 : 92649 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9591 : : scalar_to_vec,
9592 : : slp_node, 0, vect_prologue);
9593 : 92649 : if (dump_enabled_p ())
9594 : 3954 : dump_printf_loc (MSG_NOTE, vect_location,
9595 : : "vect_model_induction_cost: inside_cost = %d, "
9596 : : "prologue_cost = %d .\n", inside_cost,
9597 : : prologue_cost);
9598 : :
9599 : 92649 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9600 : 92649 : DUMP_VECT_SCOPE ("vectorizable_induction");
9601 : 92649 : return true;
9602 : : }
9603 : :
9604 : : /* Transform. */
9605 : :
9606 : : /* Compute a vector variable, initialized with the first VF values of
9607 : : the induction variable. E.g., for an iv with IV_PHI='X' and
9608 : : evolution S, for a vector of 4 units, we want to compute:
9609 : : [X, X + S, X + 2*S, X + 3*S]. */
9610 : :
9611 : 17068 : if (dump_enabled_p ())
9612 : 2835 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9613 : :
9614 : 17068 : pe = loop_preheader_edge (iv_loop);
9615 : : /* Find the first insertion point in the BB. */
9616 : 17068 : basic_block bb = gimple_bb (phi);
9617 : 17068 : si = gsi_after_labels (bb);
9618 : :
9619 : : /* For SLP induction we have to generate several IVs as for example
9620 : : with group size 3 we need
9621 : : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9622 : : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9623 : 17068 : gimple_stmt_iterator incr_si;
9624 : 17068 : bool insert_after;
9625 : 17068 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
9626 : :
9627 : : /* The initial values are vectorized, but any lanes > group_size
9628 : : need adjustment. */
9629 : 17068 : slp_tree init_node
9630 : 17068 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9631 : :
9632 : : /* Gather steps. Since we do not vectorize inductions as
9633 : : cycles we have to reconstruct the step from SCEV data. */
9634 : 17068 : unsigned group_size = SLP_TREE_LANES (slp_node);
9635 : 17068 : tree *steps = XALLOCAVEC (tree, group_size);
9636 : 17068 : tree *inits = XALLOCAVEC (tree, group_size);
9637 : 17068 : stmt_vec_info phi_info;
9638 : 52409 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9639 : : {
9640 : 18273 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9641 : 18273 : if (!init_node)
9642 : 18097 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9643 : : pe->dest_idx);
9644 : : }
9645 : :
9646 : : /* Now generate the IVs. */
9647 : 34136 : gcc_assert (multiple_p (nunits * nvects, group_size));
9648 : 17068 : unsigned nivs;
9649 : 17068 : unsigned HOST_WIDE_INT const_nunits;
9650 : 17068 : if (nested_in_vect_loop)
9651 : : nivs = nvects;
9652 : 16906 : else if (nunits.is_constant (&const_nunits))
9653 : : {
9654 : : /* Compute the number of distinct IVs we need. First reduce
9655 : : group_size if it is a multiple of const_nunits so we get
9656 : : one IV for a group_size of 4 but const_nunits 2. */
9657 : 16906 : unsigned group_sizep = group_size;
9658 : 16906 : if (group_sizep % const_nunits == 0)
9659 : 109 : group_sizep = group_sizep / const_nunits;
9660 : 16906 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
9661 : : }
9662 : : else
9663 : : {
9664 : : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9665 : : nivs = 1;
9666 : : }
9667 : 17068 : gimple_seq init_stmts = NULL;
9668 : 17068 : tree lupdate_mul = NULL_TREE;
9669 : 162 : if (!nested_in_vect_loop)
9670 : : {
9671 : 16906 : if (nunits.is_constant (&const_nunits))
9672 : : {
9673 : : /* The number of iterations covered in one vector iteration. */
9674 : 16906 : unsigned lup_mul = (nvects * const_nunits) / group_size;
9675 : 16906 : lupdate_mul
9676 : 16906 : = build_vector_from_val (step_vectype,
9677 : 16906 : SCALAR_FLOAT_TYPE_P (stept)
9678 : 27 : ? build_real_from_wide (stept, lup_mul,
9679 : : UNSIGNED)
9680 : 33785 : : build_int_cstu (stept, lup_mul));
9681 : : }
9682 : : else
9683 : : {
9684 : : if (SCALAR_FLOAT_TYPE_P (stept))
9685 : : {
9686 : : tree tem = build_int_cst (integer_type_node, vf);
9687 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9688 : : }
9689 : : else
9690 : : lupdate_mul = build_int_cst (stept, vf);
9691 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9692 : : lupdate_mul);
9693 : : }
9694 : : }
9695 : 17068 : tree peel_mul = NULL_TREE;
9696 : 17068 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9697 : : {
9698 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9699 : 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9700 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9701 : : else
9702 : 0 : peel_mul = gimple_convert (&init_stmts, stept,
9703 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9704 : 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
9705 : : step_vectype, peel_mul);
9706 : :
9707 : : /* If early break then we have to create a new PHI which we can use as
9708 : : an offset to adjust the induction reduction in early exits.
9709 : :
9710 : : This is because when peeling for alignment using masking, the first
9711 : : few elements of the vector can be inactive. As such if we find the
9712 : : entry in the first iteration we have adjust the starting point of
9713 : : the scalar code.
9714 : :
9715 : : We do this by creating a new scalar PHI that keeps track of whether
9716 : : we are the first iteration of the loop (with the additional masking)
9717 : : or whether we have taken a loop iteration already.
9718 : :
9719 : : The generated sequence:
9720 : :
9721 : : pre-header:
9722 : : bb1:
9723 : : i_1 = <number of leading inactive elements>
9724 : :
9725 : : header:
9726 : : bb2:
9727 : : i_2 = PHI <i_1(bb1), 0(latch)>
9728 : : …
9729 : :
9730 : : early-exit:
9731 : : bb3:
9732 : : i_3 = iv_step * i_2 + PHI<vector-iv>
9733 : :
9734 : : The first part of the adjustment to create i_1 and i_2 are done here
9735 : : and the last part creating i_3 is done in
9736 : : vectorizable_live_operations when the induction extraction is
9737 : : materialized. */
9738 : 0 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
9739 : 0 : && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
9740 : : {
9741 : 0 : auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9742 : 0 : tree ty_skip_niters = TREE_TYPE (skip_niters);
9743 : 0 : tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
9744 : : vect_scalar_var,
9745 : : "pfa_iv_offset");
9746 : 0 : gphi *nphi = create_phi_node (break_lhs_phi, bb);
9747 : 0 : add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
9748 : 0 : add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
9749 : : loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
9750 : :
9751 : 0 : LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo) = PHI_RESULT (nphi);
9752 : : }
9753 : : }
9754 : 17068 : tree step_mul = NULL_TREE;
9755 : 17068 : unsigned ivn;
9756 : 17068 : auto_vec<tree> vec_steps;
9757 : 34702 : for (ivn = 0; ivn < nivs; ++ivn)
9758 : : {
9759 : 17634 : gimple_seq stmts = NULL;
9760 : 17634 : bool invariant = true;
9761 : 17634 : if (nunits.is_constant (&const_nunits))
9762 : : {
9763 : 17634 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9764 : 17634 : tree_vector_builder init_elts (vectype, const_nunits, 1);
9765 : 17634 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9766 : 117800 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9767 : : {
9768 : : /* The scalar steps of the IVs. */
9769 : 100166 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9770 : 100166 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9771 : 100166 : step_elts.quick_push (elt);
9772 : 100166 : if (!init_node)
9773 : : {
9774 : : /* The scalar inits of the IVs if not vectorized. */
9775 : 99204 : elt = inits[(ivn*const_nunits + eltn) % group_size];
9776 : 99204 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
9777 : 99204 : TREE_TYPE (elt)))
9778 : 266 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9779 : 266 : TREE_TYPE (vectype), elt);
9780 : 99204 : init_elts.quick_push (elt);
9781 : : }
9782 : : /* The number of steps to add to the initial values. */
9783 : 100166 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9784 : 200332 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9785 : 200234 : ? build_real_from_wide (stept, mul_elt,
9786 : : UNSIGNED)
9787 : 200234 : : build_int_cstu (stept, mul_elt));
9788 : : }
9789 : 17634 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
9790 : 17634 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9791 : 17634 : if (!init_node)
9792 : 17440 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
9793 : 17634 : }
9794 : : else
9795 : : {
9796 : : if (init_node)
9797 : : ;
9798 : : else if (INTEGRAL_TYPE_P (TREE_TYPE (steps[0])))
9799 : : {
9800 : : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9801 : : /* Build the initial value directly as a VEC_SERIES_EXPR. */
9802 : : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
9803 : : step_vectype, new_name, steps[0]);
9804 : : if (!useless_type_conversion_p (vectype, step_vectype))
9805 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9806 : : vectype, vec_init);
9807 : : }
9808 : : else
9809 : : {
9810 : : /* Build:
9811 : : [base, base, base, ...]
9812 : : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9813 : : gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps[0])));
9814 : : gcc_assert (flag_associative_math);
9815 : : gcc_assert (index_vectype != NULL_TREE);
9816 : :
9817 : : tree index = build_index_vector (index_vectype, 0, 1);
9818 : : new_name = gimple_convert (&init_stmts, TREE_TYPE (steps[0]),
9819 : : inits[0]);
9820 : : tree base_vec = gimple_build_vector_from_val (&init_stmts,
9821 : : step_vectype,
9822 : : new_name);
9823 : : tree step_vec = gimple_build_vector_from_val (&init_stmts,
9824 : : step_vectype,
9825 : : steps[0]);
9826 : : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
9827 : : step_vectype, index);
9828 : : vec_init = gimple_build (&init_stmts, MULT_EXPR,
9829 : : step_vectype, vec_init, step_vec);
9830 : : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
9831 : : step_vectype, vec_init, base_vec);
9832 : : if (!useless_type_conversion_p (vectype, step_vectype))
9833 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9834 : : vectype, vec_init);
9835 : : }
9836 : : /* iv_loop is nested in the loop to be vectorized. Generate:
9837 : : vec_step = [S, S, S, S] */
9838 : : t = unshare_expr (steps[0]);
9839 : : gcc_assert (CONSTANT_CLASS_P (t)
9840 : : || TREE_CODE (t) == SSA_NAME);
9841 : : vec_step = gimple_build_vector_from_val (&init_stmts,
9842 : : step_vectype, t);
9843 : : }
9844 : 17634 : vec_steps.safe_push (vec_step);
9845 : 17634 : if (peel_mul)
9846 : : {
9847 : 0 : if (!step_mul)
9848 : : {
9849 : 0 : gcc_assert (!nunits.is_constant ());
9850 : : step_mul = gimple_build (&init_stmts,
9851 : : MINUS_EXPR, step_vectype,
9852 : : build_zero_cst (step_vectype), peel_mul);
9853 : : }
9854 : : else
9855 : 0 : step_mul = gimple_build (&init_stmts,
9856 : : MINUS_EXPR, step_vectype,
9857 : : step_mul, peel_mul);
9858 : : }
9859 : :
9860 : : /* Create the induction-phi that defines the induction-operand. */
9861 : 17634 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9862 : : "vec_iv_");
9863 : 17634 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9864 : 17634 : induc_def = PHI_RESULT (induction_phi);
9865 : :
9866 : : /* Create the iv update inside the loop */
9867 : 17634 : tree up = vec_step;
9868 : 17634 : if (lupdate_mul)
9869 : : {
9870 : 17440 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
9871 : : {
9872 : : /* When we're using loop_len produced by SELEC_VL, the
9873 : : non-final iterations are not always processing VF
9874 : : elements. So vectorize induction variable instead of
9875 : :
9876 : : _21 = vect_vec_iv_.6_22 + { VF, ... };
9877 : :
9878 : : We should generate:
9879 : :
9880 : : _35 = .SELECT_VL (ivtmp_33, VF);
9881 : : vect_cst__22 = [vec_duplicate_expr] _35;
9882 : : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
9883 : 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
9884 : 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
9885 : : vectype, 0, 0);
9886 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9887 : 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
9888 : : else
9889 : 0 : expr = gimple_convert (&stmts, stept, len);
9890 : 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
9891 : : expr);
9892 : 0 : up = gimple_build (&stmts, MULT_EXPR,
9893 : : step_vectype, vec_step, lupdate_mul);
9894 : : }
9895 : : else
9896 : 17440 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9897 : : vec_step, lupdate_mul);
9898 : : }
9899 : 17634 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9900 : 17634 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
9901 : 17634 : vec_def = gimple_convert (&stmts, vectype, vec_def);
9902 : 17634 : insert_iv_increment (&incr_si, insert_after, stmts);
9903 : 17634 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9904 : : UNKNOWN_LOCATION);
9905 : :
9906 : 17634 : if (init_node)
9907 : 194 : vec_init = vect_get_slp_vect_def (init_node, ivn);
9908 : 17634 : if (!nested_in_vect_loop
9909 : 17634 : && step_mul
9910 : 17634 : && !integer_zerop (step_mul))
9911 : : {
9912 : 17006 : gcc_assert (invariant);
9913 : 17006 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9914 : 17006 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9915 : : vec_step, step_mul);
9916 : 17006 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9917 : : vec_def, up);
9918 : 17006 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9919 : : }
9920 : :
9921 : : /* Set the arguments of the phi node: */
9922 : 17634 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9923 : :
9924 : 17634 : slp_node->push_vec_def (induction_phi);
9925 : : }
9926 : 17068 : if (!nested_in_vect_loop)
9927 : : {
9928 : : /* Fill up to the number of vectors we need for the whole group. */
9929 : 16906 : if (nunits.is_constant (&const_nunits))
9930 : 16906 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
9931 : : else
9932 : : nivs = 1;
9933 : 16906 : vec_steps.reserve (nivs-ivn);
9934 : 33833 : for (; ivn < nivs; ++ivn)
9935 : : {
9936 : 21 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9937 : 21 : vec_steps.quick_push (vec_steps[0]);
9938 : : }
9939 : : }
9940 : :
9941 : : /* Re-use IVs when we can. We are generating further vector
9942 : : stmts by adding VF' * stride to the IVs generated above. */
9943 : 17068 : if (ivn < nvects)
9944 : : {
9945 : 4105 : if (nunits.is_constant (&const_nunits))
9946 : : {
9947 : 4105 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
9948 : 4105 : / group_size);
9949 : 4105 : lupdate_mul
9950 : 4105 : = build_vector_from_val (step_vectype,
9951 : 4105 : SCALAR_FLOAT_TYPE_P (stept)
9952 : 8 : ? build_real_from_wide (stept,
9953 : 8 : vfp, UNSIGNED)
9954 : 8202 : : build_int_cstu (stept, vfp));
9955 : : }
9956 : : else
9957 : : {
9958 : : if (SCALAR_FLOAT_TYPE_P (stept))
9959 : : {
9960 : : tree tem = build_int_cst (integer_type_node, nunits);
9961 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9962 : : }
9963 : : else
9964 : : lupdate_mul = build_int_cst (stept, nunits);
9965 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9966 : : lupdate_mul);
9967 : : }
9968 : 12854 : for (; ivn < nvects; ++ivn)
9969 : : {
9970 : 8749 : gimple *iv
9971 : 8749 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9972 : 8749 : tree def = gimple_get_lhs (iv);
9973 : 8749 : if (ivn < 2*nivs)
9974 : 4197 : vec_steps[ivn - nivs]
9975 : 4197 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9976 : 4197 : vec_steps[ivn - nivs], lupdate_mul);
9977 : 8749 : gimple_seq stmts = NULL;
9978 : 8749 : def = gimple_convert (&stmts, step_vectype, def);
9979 : 26247 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9980 : 8749 : def, vec_steps[ivn % nivs]);
9981 : 8749 : def = gimple_convert (&stmts, vectype, def);
9982 : 8749 : if (gimple_code (iv) == GIMPLE_PHI)
9983 : 4197 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9984 : : else
9985 : : {
9986 : 4552 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9987 : 4552 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9988 : : }
9989 : 8749 : slp_node->push_vec_def (def);
9990 : : }
9991 : : }
9992 : :
9993 : 17068 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9994 : 17068 : gcc_assert (!new_bb);
9995 : :
9996 : 17068 : return true;
9997 : 17068 : }
9998 : :
9999 : : /* Function vectorizable_live_operation_1.
10000 : :
10001 : : helper function for vectorizable_live_operation. */
10002 : :
10003 : : static tree
10004 : 5259 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
10005 : : tree vectype, slp_tree slp_node,
10006 : : tree bitsize, tree bitstart, tree vec_lhs,
10007 : : tree lhs_type, gimple_stmt_iterator *exit_gsi)
10008 : : {
10009 : 5259 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10010 : :
10011 : 5259 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10012 : 5259 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10013 : 10911 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10014 : 5652 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
10015 : :
10016 : 5259 : gimple_seq stmts = NULL;
10017 : 5259 : tree new_tree;
10018 : :
10019 : : /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10020 : 5259 : if (integer_zerop (bitstart))
10021 : : {
10022 : 2735 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10023 : : vec_lhs_phi, bitsize, bitstart);
10024 : :
10025 : : /* Convert the extracted vector element to the scalar type. */
10026 : 2735 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10027 : : }
10028 : 2524 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10029 : : {
10030 : : /* Emit:
10031 : :
10032 : : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - (BIAS + 1)>
10033 : :
10034 : : where VEC_LHS is the vectorized live-out result, LEN is the length of
10035 : : the vector, BIAS is the load-store bias. The bias should not be used
10036 : : at all since we are not using load/store operations, but LEN will be
10037 : : REALLEN + BIAS, so subtract it to get to the correct position. */
10038 : 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10039 : 0 : gimple_seq tem = NULL;
10040 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10041 : 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
10042 : : &LOOP_VINFO_LENS (loop_vinfo),
10043 : : 1, vectype, 0, 1);
10044 : 0 : gimple_seq_add_seq (&stmts, tem);
10045 : :
10046 : : /* BIAS + 1. */
10047 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10048 : 0 : tree bias_plus_one
10049 : 0 : = int_const_binop (PLUS_EXPR,
10050 : 0 : build_int_cst (TREE_TYPE (len), biasval),
10051 : 0 : build_one_cst (TREE_TYPE (len)));
10052 : :
10053 : : /* LAST_INDEX = LEN - (BIAS + 1). */
10054 : 0 : tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
10055 : : len, bias_plus_one);
10056 : :
10057 : : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - (BIAS + 1)>. */
10058 : 0 : tree scalar_res
10059 : 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10060 : : vec_lhs_phi, last_index);
10061 : :
10062 : : /* Convert the extracted vector element to the scalar type. */
10063 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10064 : : }
10065 : 2524 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10066 : : {
10067 : : /* Emit:
10068 : :
10069 : : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10070 : :
10071 : : where VEC_LHS is the vectorized live-out result and MASK is
10072 : : the loop mask for the final iteration. */
10073 : 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10074 : 0 : tree scalar_type = TREE_TYPE (vectype);
10075 : 0 : gimple_seq tem = NULL;
10076 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10077 : 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10078 : : &LOOP_VINFO_MASKS (loop_vinfo),
10079 : : 1, vectype, 0);
10080 : 0 : tree scalar_res;
10081 : 0 : gimple_seq_add_seq (&stmts, tem);
10082 : :
10083 : 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10084 : : mask, vec_lhs_phi);
10085 : :
10086 : : /* Convert the extracted vector element to the scalar type. */
10087 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10088 : : }
10089 : : else
10090 : : {
10091 : 2524 : tree bftype = TREE_TYPE (vectype);
10092 : 2524 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10093 : 85 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10094 : 2524 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10095 : 2524 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10096 : : &stmts, true, NULL_TREE);
10097 : : }
10098 : :
10099 : 5259 : *exit_gsi = gsi_after_labels (exit_bb);
10100 : 5259 : if (stmts)
10101 : 5259 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10102 : :
10103 : 5259 : return new_tree;
10104 : : }
10105 : :
10106 : : /* Function vectorizable_live_operation.
10107 : :
10108 : : STMT_INFO computes a value that is used outside the loop. Check if
10109 : : it can be supported. */
10110 : :
10111 : : bool
10112 : 277452 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10113 : : slp_tree slp_node, slp_instance slp_node_instance,
10114 : : int slp_index, bool vec_stmt_p,
10115 : : stmt_vector_for_cost *cost_vec)
10116 : : {
10117 : 277452 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10118 : 277452 : imm_use_iterator imm_iter;
10119 : 277452 : tree lhs, lhs_type, bitsize;
10120 : 277452 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10121 : 277452 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10122 : 277452 : gimple *use_stmt;
10123 : 277452 : use_operand_p use_p;
10124 : 277452 : auto_vec<tree> vec_oprnds;
10125 : 277452 : int vec_entry = 0;
10126 : 277452 : poly_uint64 vec_index = 0;
10127 : :
10128 : 277452 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10129 : : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10130 : :
10131 : : /* If a stmt of a reduction is live, vectorize it via
10132 : : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10133 : : validity so just trigger the transform here. */
10134 : 277452 : if (vect_is_reduction (slp_node))
10135 : : {
10136 : 56796 : if (!vec_stmt_p)
10137 : : return true;
10138 : : /* For SLP reductions we vectorize the epilogue for all involved stmts
10139 : : together. For SLP reduction chains we only get here once. */
10140 : 23188 : if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
10141 : 22945 : && slp_index != 0)
10142 : : return true;
10143 : 22729 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
10144 : 22729 : if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10145 : 22729 : || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10146 : : return true;
10147 : :
10148 : 21899 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10149 : 21899 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10150 : 21895 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10151 : : slp_node_instance,
10152 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10153 : :
10154 : : /* If early break we only have to materialize the reduction on the merge
10155 : : block, but we have to find an alternate exit first. */
10156 : 21899 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10157 : : {
10158 : 23 : slp_tree phis_node = slp_node_instance->reduc_phis;
10159 : 23 : stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
10160 : 69 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10161 : 23 : if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10162 : : {
10163 : 23 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10164 : : phis_node, slp_node_instance,
10165 : : exit);
10166 : 23 : break;
10167 : 23 : }
10168 : 23 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10169 : 4 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10170 : : phis_node, slp_node_instance,
10171 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10172 : : }
10173 : :
10174 : 21899 : return true;
10175 : : }
10176 : :
10177 : : /* If STMT is not relevant and it is a simple assignment and its inputs are
10178 : : invariant then it can remain in place, unvectorized. The original last
10179 : : scalar value that it computes will be used. */
10180 : 220656 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10181 : : {
10182 : 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10183 : 0 : if (dump_enabled_p ())
10184 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
10185 : : "statement is simple and uses invariant. Leaving in "
10186 : : "place.\n");
10187 : 0 : return true;
10188 : : }
10189 : :
10190 : 220656 : gcc_assert (slp_index >= 0);
10191 : :
10192 : : /* Get the last occurrence of the scalar index from the concatenation of
10193 : : all the slp vectors. Calculate which slp vector it is and the index
10194 : : within. */
10195 : 220656 : int num_scalar = SLP_TREE_LANES (slp_node);
10196 : 220656 : int num_vec = vect_get_num_copies (vinfo, slp_node);
10197 : 220656 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10198 : :
10199 : : /* Calculate which vector contains the result, and which lane of
10200 : : that vector we need. */
10201 : 220656 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10202 : : {
10203 : : if (dump_enabled_p ())
10204 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10205 : : "Cannot determine which vector holds the"
10206 : : " final result.\n");
10207 : : return false;
10208 : : }
10209 : :
10210 : 220656 : if (!vec_stmt_p)
10211 : : {
10212 : : /* No transformation required. */
10213 : 179119 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10214 : : {
10215 : 33445 : if (SLP_TREE_LANES (slp_node) != 1)
10216 : : {
10217 : 15 : if (dump_enabled_p ())
10218 : 15 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10219 : : "can't operate on partial vectors "
10220 : : "because an SLP statement is live after "
10221 : : "the loop.\n");
10222 : 15 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10223 : : }
10224 : 33430 : else if (num_vec > 1)
10225 : : {
10226 : 18437 : if (dump_enabled_p ())
10227 : 57 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10228 : : "can't operate on partial vectors "
10229 : : "because ncopies is greater than 1.\n");
10230 : 18437 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10231 : : }
10232 : : else
10233 : : {
10234 : 14993 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10235 : : OPTIMIZE_FOR_SPEED))
10236 : 0 : vect_record_loop_mask (loop_vinfo,
10237 : : &LOOP_VINFO_MASKS (loop_vinfo),
10238 : : 1, vectype, NULL);
10239 : 14993 : else if (can_vec_extract_var_idx_p (
10240 : 14993 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10241 : 0 : vect_record_loop_len (loop_vinfo,
10242 : : &LOOP_VINFO_LENS (loop_vinfo),
10243 : : 1, vectype, 1);
10244 : : else
10245 : : {
10246 : 14993 : if (dump_enabled_p ())
10247 : 840 : dump_printf_loc (
10248 : 840 : MSG_MISSED_OPTIMIZATION, vect_location,
10249 : : "can't operate on partial vectors "
10250 : : "because the target doesn't support extract "
10251 : : "last reduction.\n");
10252 : 14993 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10253 : : }
10254 : : }
10255 : : }
10256 : : /* ??? Enable for loop costing as well. */
10257 : 33445 : if (!loop_vinfo)
10258 : 88870 : record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
10259 : : 0, vect_epilogue);
10260 : 179119 : return true;
10261 : : }
10262 : :
10263 : : /* Use the lhs of the original scalar statement. */
10264 : 41537 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10265 : 41537 : if (dump_enabled_p ())
10266 : 1474 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10267 : : "stmt %G", stmt);
10268 : :
10269 : 41537 : lhs = gimple_get_lhs (stmt);
10270 : 41537 : lhs_type = TREE_TYPE (lhs);
10271 : :
10272 : 41537 : bitsize = vector_element_bits_tree (vectype);
10273 : :
10274 : : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10275 : 41537 : gcc_assert (!loop_vinfo
10276 : : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10277 : : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10278 : : || SLP_TREE_LANES (slp_node) == 1));
10279 : :
10280 : : /* Get the correct slp vectorized stmt. */
10281 : 41537 : tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10282 : 41537 : gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10283 : :
10284 : : /* In case we need to early break vectorize also get the first stmt. */
10285 : 41537 : tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10286 : :
10287 : : /* Get entry to use. */
10288 : 41537 : tree bitstart = bitsize_int (vec_index);
10289 : 41537 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10290 : :
10291 : 41537 : if (loop_vinfo)
10292 : : {
10293 : : /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10294 : : requirement, insert one phi node for it. It looks like:
10295 : : loop;
10296 : : BB:
10297 : : # lhs' = PHI <lhs>
10298 : : ==>
10299 : : loop;
10300 : : BB:
10301 : : # vec_lhs' = PHI <vec_lhs>
10302 : : new_tree = lane_extract <vec_lhs', ...>;
10303 : : lhs' = new_tree; */
10304 : :
10305 : 5304 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10306 : : /* Check if we have a loop where the chosen exit is not the main exit,
10307 : : in these cases for an early break we restart the iteration the vector code
10308 : : did. For the live values we want the value at the start of the iteration
10309 : : rather than at the end. */
10310 : 5304 : edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10311 : 5304 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10312 : 27844 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10313 : 17236 : if (!is_gimple_debug (use_stmt)
10314 : 17236 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10315 : 5259 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10316 : : {
10317 : 5259 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10318 : 5259 : phi_arg_index_from_use (use_p));
10319 : 5259 : gcc_assert (loop_exit_edge_p (loop, e));
10320 : 5259 : bool main_exit_edge = e == main_e;
10321 : 5259 : tree tmp_vec_lhs = vec_lhs;
10322 : 5259 : tree tmp_bitstart = bitstart;
10323 : :
10324 : : /* For early exit where the exit is not in the BB that leads
10325 : : to the latch then we're restarting the iteration in the
10326 : : scalar loop. So get the first live value. */
10327 : 13235 : bool early_break_first_element_p
10328 : 5259 : = (all_exits_as_early_p || !main_exit_edge)
10329 : 5259 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def;
10330 : 2717 : if (early_break_first_element_p)
10331 : : {
10332 : 2717 : tmp_vec_lhs = vec_lhs0;
10333 : 2717 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10334 : : }
10335 : :
10336 : 5259 : gimple_stmt_iterator exit_gsi;
10337 : 5259 : tree new_tree
10338 : 5259 : = vectorizable_live_operation_1 (loop_vinfo,
10339 : : e->dest, vectype,
10340 : : slp_node, bitsize,
10341 : : tmp_bitstart, tmp_vec_lhs,
10342 : : lhs_type, &exit_gsi);
10343 : :
10344 : 5259 : auto gsi = gsi_for_stmt (use_stmt);
10345 : 5259 : if (early_break_first_element_p
10346 : 2717 : && LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
10347 : : {
10348 : 0 : tree step_expr
10349 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10350 : 0 : tree break_lhs_phi
10351 : : = LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo);
10352 : 0 : tree ty_skip_niters = TREE_TYPE (break_lhs_phi);
10353 : 0 : gimple_seq iv_stmts = NULL;
10354 : :
10355 : : /* Now create the PHI for the outside loop usage to
10356 : : retrieve the value for the offset counter. */
10357 : 0 : tree rphi_step
10358 : 0 : = gimple_convert (&iv_stmts, ty_skip_niters, step_expr);
10359 : 0 : tree tmp2
10360 : 0 : = gimple_build (&iv_stmts, MULT_EXPR,
10361 : : ty_skip_niters, rphi_step,
10362 : : break_lhs_phi);
10363 : :
10364 : 0 : if (POINTER_TYPE_P (TREE_TYPE (new_tree)))
10365 : : {
10366 : 0 : tmp2 = gimple_convert (&iv_stmts, sizetype, tmp2);
10367 : 0 : tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
10368 : 0 : TREE_TYPE (new_tree), new_tree,
10369 : : tmp2);
10370 : : }
10371 : : else
10372 : : {
10373 : 0 : tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree),
10374 : : tmp2);
10375 : 0 : tmp2 = gimple_build (&iv_stmts, PLUS_EXPR,
10376 : 0 : TREE_TYPE (new_tree), new_tree,
10377 : : tmp2);
10378 : : }
10379 : :
10380 : 0 : new_tree = tmp2;
10381 : 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
10382 : : }
10383 : :
10384 : 5259 : tree lhs_phi = gimple_phi_result (use_stmt);
10385 : 5259 : remove_phi_node (&gsi, false);
10386 : 5259 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10387 : 5259 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10388 : 5259 : break;
10389 : 5304 : }
10390 : :
10391 : : /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10392 : 22585 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10393 : 11977 : gcc_assert (is_gimple_debug (use_stmt)
10394 : 5304 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10395 : : }
10396 : : else
10397 : : {
10398 : : /* For basic-block vectorization simply insert the lane-extraction. */
10399 : 36233 : tree bftype = TREE_TYPE (vectype);
10400 : 36233 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10401 : 0 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10402 : 36233 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10403 : : vec_lhs, bitsize, bitstart);
10404 : 36233 : gimple_seq stmts = NULL;
10405 : 36233 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10406 : : &stmts, true, NULL_TREE);
10407 : 36233 : if (TREE_CODE (new_tree) == SSA_NAME
10408 : 72466 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10409 : 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10410 : 36233 : if (is_a <gphi *> (vec_stmt))
10411 : : {
10412 : 2621 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10413 : 2621 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10414 : : }
10415 : : else
10416 : : {
10417 : 33612 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10418 : 33612 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10419 : : }
10420 : :
10421 : : /* Replace use of lhs with newly computed result. If the use stmt is a
10422 : : single arg PHI, just replace all uses of PHI result. It's necessary
10423 : : because lcssa PHI defining lhs may be before newly inserted stmt. */
10424 : 36233 : use_operand_p use_p;
10425 : 36233 : stmt_vec_info use_stmt_info;
10426 : 235427 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10427 : 162961 : if (!is_gimple_debug (use_stmt)
10428 : 162961 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10429 : 108634 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10430 : : {
10431 : : /* ??? This can happen when the live lane ends up being
10432 : : rooted in a vector construction code-generated by an
10433 : : external SLP node (and code-generation for that already
10434 : : happened). See gcc.dg/vect/bb-slp-47.c.
10435 : : Doing this is what would happen if that vector CTOR
10436 : : were not code-generated yet so it is not too bad.
10437 : : ??? In fact we'd likely want to avoid this situation
10438 : : in the first place. */
10439 : 63232 : if (TREE_CODE (new_tree) == SSA_NAME
10440 : 62968 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10441 : 62968 : && gimple_code (use_stmt) != GIMPLE_PHI
10442 : 118233 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10443 : : use_stmt))
10444 : : {
10445 : 264 : if (dump_enabled_p ())
10446 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10447 : : "Using original scalar computation for "
10448 : : "live lane because use preceeds vector "
10449 : : "def\n");
10450 : 264 : continue;
10451 : : }
10452 : : /* ??? It can also happen that we end up pulling a def into
10453 : : a loop where replacing out-of-loop uses would require
10454 : : a new LC SSA PHI node. Retain the original scalar in
10455 : : those cases as well. PR98064. */
10456 : 64295 : if (TREE_CODE (new_tree) == SSA_NAME
10457 : 62704 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10458 : 62704 : && (gimple_bb (use_stmt)->loop_father
10459 : 62704 : != gimple_bb (vec_stmt)->loop_father)
10460 : 69747 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10461 : 7043 : gimple_bb (use_stmt)->loop_father))
10462 : : {
10463 : 1591 : if (dump_enabled_p ())
10464 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10465 : : "Using original scalar computation for "
10466 : : "live lane because there is an out-of-loop "
10467 : : "definition for it\n");
10468 : 1591 : continue;
10469 : : }
10470 : 187933 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10471 : 63410 : SET_USE (use_p, new_tree);
10472 : 61113 : update_stmt (use_stmt);
10473 : 36233 : }
10474 : : }
10475 : :
10476 : : return true;
10477 : 277452 : }
10478 : :
10479 : : /* Given loop represented by LOOP_VINFO, return true if computation of
10480 : : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10481 : : otherwise. */
10482 : :
10483 : : static bool
10484 : 60623 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
10485 : : {
10486 : : /* Constant case. */
10487 : 60623 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10488 : : {
10489 : 35471 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10490 : 35471 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10491 : :
10492 : 35471 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10493 : 35471 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10494 : 35471 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10495 : : return true;
10496 : : }
10497 : :
10498 : 25152 : widest_int max;
10499 : 25152 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10500 : : /* Check the upper bound of loop niters. */
10501 : 25152 : if (get_max_loop_iterations (loop, &max))
10502 : : {
10503 : 25152 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10504 : 25152 : signop sgn = TYPE_SIGN (type);
10505 : 25152 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10506 : 25152 : if (max < type_max)
10507 : 24931 : return true;
10508 : 25152 : }
10509 : : return false;
10510 : 25152 : }
10511 : :
10512 : : /* Return a mask type with half the number of elements as OLD_TYPE,
10513 : : given that it should have mode NEW_MODE. */
10514 : :
10515 : : tree
10516 : 3920 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10517 : : {
10518 : 3920 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10519 : 3920 : return build_truth_vector_type_for_mode (nunits, new_mode);
10520 : : }
10521 : :
10522 : : /* Return a mask type with twice as many elements as OLD_TYPE,
10523 : : given that it should have mode NEW_MODE. */
10524 : :
10525 : : tree
10526 : 5911 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10527 : : {
10528 : 5911 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10529 : 5911 : return build_truth_vector_type_for_mode (nunits, new_mode);
10530 : : }
10531 : :
10532 : : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10533 : : contain a sequence of NVECTORS masks that each control a vector of type
10534 : : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10535 : : these vector masks with the vector version of SCALAR_MASK. */
10536 : :
10537 : : void
10538 : 77662 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10539 : : unsigned int nvectors, tree vectype, tree scalar_mask)
10540 : : {
10541 : 77662 : gcc_assert (nvectors != 0);
10542 : :
10543 : 77662 : if (scalar_mask)
10544 : : {
10545 : 3504 : scalar_cond_masked_key cond (scalar_mask, nvectors);
10546 : 3504 : loop_vinfo->scalar_cond_masked_set.add (cond);
10547 : : }
10548 : :
10549 : 77662 : masks->mask_set.add (std::make_pair (vectype, nvectors));
10550 : 77662 : }
10551 : :
10552 : : /* Given a complete set of masks MASKS, extract mask number INDEX
10553 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10554 : : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10555 : :
10556 : : See the comment above vec_loop_masks for more details about the mask
10557 : : arrangement. */
10558 : :
10559 : : tree
10560 : 203 : vect_get_loop_mask (loop_vec_info loop_vinfo,
10561 : : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10562 : : unsigned int nvectors, tree vectype, unsigned int index)
10563 : : {
10564 : 203 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10565 : : == vect_partial_vectors_while_ult)
10566 : : {
10567 : 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10568 : 0 : tree mask_type = rgm->type;
10569 : :
10570 : : /* Populate the rgroup's mask array, if this is the first time we've
10571 : : used it. */
10572 : 0 : if (rgm->controls.is_empty ())
10573 : : {
10574 : 0 : rgm->controls.safe_grow_cleared (nvectors, true);
10575 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
10576 : : {
10577 : 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10578 : : /* Provide a dummy definition until the real one is available. */
10579 : 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10580 : 0 : rgm->controls[i] = mask;
10581 : : }
10582 : : }
10583 : :
10584 : 0 : tree mask = rgm->controls[index];
10585 : 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10586 : 0 : TYPE_VECTOR_SUBPARTS (vectype)))
10587 : : {
10588 : : /* A loop mask for data type X can be reused for data type Y
10589 : : if X has N times more elements than Y and if Y's elements
10590 : : are N times bigger than X's. In this case each sequence
10591 : : of N elements in the loop mask will be all-zero or all-one.
10592 : : We can then view-convert the mask so that each sequence of
10593 : : N elements is replaced by a single element. */
10594 : 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10595 : : TYPE_VECTOR_SUBPARTS (vectype)));
10596 : 0 : gimple_seq seq = NULL;
10597 : 0 : mask_type = truth_type_for (vectype);
10598 : 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10599 : 0 : if (seq)
10600 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10601 : : }
10602 : 0 : return mask;
10603 : : }
10604 : 203 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10605 : : == vect_partial_vectors_avx512)
10606 : : {
10607 : : /* The number of scalars per iteration and the number of vectors are
10608 : : both compile-time constants. */
10609 : 203 : unsigned int nscalars_per_iter
10610 : 203 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10611 : 203 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10612 : :
10613 : 203 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10614 : :
10615 : : /* The stored nV is dependent on the mask type produced. */
10616 : 203 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10617 : : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10618 : : == rgm->factor);
10619 : 203 : nvectors = rgm->factor;
10620 : :
10621 : : /* Populate the rgroup's mask array, if this is the first time we've
10622 : : used it. */
10623 : 203 : if (rgm->controls.is_empty ())
10624 : : {
10625 : 19 : rgm->controls.safe_grow_cleared (nvectors, true);
10626 : 104 : for (unsigned int i = 0; i < nvectors; ++i)
10627 : : {
10628 : 85 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10629 : : /* Provide a dummy definition until the real one is available. */
10630 : 85 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10631 : 85 : rgm->controls[i] = mask;
10632 : : }
10633 : : }
10634 : 203 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10635 : : TYPE_VECTOR_SUBPARTS (vectype)))
10636 : 155 : return rgm->controls[index];
10637 : :
10638 : : /* Split the vector if needed. Since we are dealing with integer mode
10639 : : masks with AVX512 we can operate on the integer representation
10640 : : performing the whole vector shifting. */
10641 : 48 : unsigned HOST_WIDE_INT factor;
10642 : 48 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10643 : 48 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
10644 : 0 : gcc_assert (ok);
10645 : 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10646 : 48 : tree mask_type = truth_type_for (vectype);
10647 : 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10648 : 48 : unsigned vi = index / factor;
10649 : 48 : unsigned vpart = index % factor;
10650 : 48 : tree vec = rgm->controls[vi];
10651 : 48 : gimple_seq seq = NULL;
10652 : 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10653 : 48 : lang_hooks.types.type_for_mode
10654 : 48 : (TYPE_MODE (rgm->type), 1), vec);
10655 : : /* For integer mode masks simply shift the right bits into position. */
10656 : 48 : if (vpart != 0)
10657 : 40 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10658 : : build_int_cst (integer_type_node,
10659 : 80 : (TYPE_VECTOR_SUBPARTS (vectype)
10660 : 40 : * vpart)));
10661 : 48 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10662 : 48 : (TYPE_MODE (mask_type), 1), vec);
10663 : 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10664 : 48 : if (seq)
10665 : 48 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10666 : 48 : return vec;
10667 : : }
10668 : : else
10669 : 0 : gcc_unreachable ();
10670 : : }
10671 : :
10672 : : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10673 : : lengths for controlling an operation on VECTYPE. The operation splits
10674 : : each element of VECTYPE into FACTOR separate subelements, measuring the
10675 : : length as a number of these subelements. */
10676 : :
10677 : : void
10678 : 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10679 : : unsigned int nvectors, tree vectype, unsigned int factor)
10680 : : {
10681 : 0 : gcc_assert (nvectors != 0);
10682 : 0 : if (lens->length () < nvectors)
10683 : 0 : lens->safe_grow_cleared (nvectors, true);
10684 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10685 : :
10686 : : /* The number of scalars per iteration, scalar occupied bytes and
10687 : : the number of vectors are both compile-time constants. */
10688 : 0 : unsigned int nscalars_per_iter
10689 : 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10690 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10691 : :
10692 : 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10693 : : {
10694 : : /* For now, we only support cases in which all loads and stores fall back
10695 : : to VnQI or none do. */
10696 : 0 : gcc_assert (!rgl->max_nscalars_per_iter
10697 : : || (rgl->factor == 1 && factor == 1)
10698 : : || (rgl->max_nscalars_per_iter * rgl->factor
10699 : : == nscalars_per_iter * factor));
10700 : 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
10701 : 0 : rgl->type = vectype;
10702 : 0 : rgl->factor = factor;
10703 : : }
10704 : 0 : }
10705 : :
10706 : : /* Given a complete set of lengths LENS, extract length number INDEX
10707 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10708 : : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10709 : : multiplied by the number of elements that should be processed.
10710 : : Insert any set-up statements before GSI. */
10711 : :
tree
vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
		   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
		   unsigned int index, unsigned int factor)
{
  rgroup_controls *rgl = &(*lens)[nvectors - 1];
  /* A nonzero partial load/store bias means a single, shared bias-adjusted
     control is used for the rgroup (note the i == 0 assertion when it is
     created below).  */
  bool use_bias_adjusted_len =
    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;

  /* Populate the rgroup's len array, if this is the first time we've
     used it.  */
  if (rgl->controls.is_empty ())
    {
      rgl->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
	  gcc_assert (len_type != NULL_TREE);

	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");

	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
	  rgl->controls[i] = len;

	  if (use_bias_adjusted_len)
	    {
	      gcc_assert (i == 0);
	      tree adjusted_len =
		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
	      rgl->bias_adjusted_ctrl = adjusted_len;
	    }
	}
    }

  if (use_bias_adjusted_len)
    return rgl->bias_adjusted_ctrl;

  tree loop_len = rgl->controls[index];
  if (rgl->factor == 1 && factor == 1)
    {
      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
      if (maybe_ne (nunits1, nunits2))
	{
	  /* A loop len for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  Convert the recorded length
	     into VECTYPE's units by dividing it by N, inserting any
	     new statements before GSI.  */
	  gcc_assert (multiple_p (nunits1, nunits2));
	  factor = exact_div (nunits1, nunits2).to_constant ();
	  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	  gimple_seq seq = NULL;
	  loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
				   build_int_cst (iv_type, factor));
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
    }
  return loop_len;
}
10773 : :
10774 : : /* Generate the tree for the loop len mask and return it. Given the lens,
10775 : : nvectors, vectype, index and factor to gen the len mask as below.
10776 : :
10777 : : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
10778 : : */
10779 : : tree
10780 : 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10781 : : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
10782 : : unsigned int nvectors, tree vectype, tree stmt,
10783 : : unsigned int index, unsigned int factor)
10784 : : {
10785 : 0 : tree all_one_mask = build_all_ones_cst (vectype);
10786 : 0 : tree all_zero_mask = build_zero_cst (vectype);
10787 : 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
10788 : : factor);
10789 : 0 : tree bias = build_int_cst (intQI_type_node,
10790 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
10791 : 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
10792 : 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
10793 : : all_one_mask, all_zero_mask, len,
10794 : : bias);
10795 : 0 : gimple_call_set_lhs (call, len_mask);
10796 : 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
10797 : :
10798 : 0 : return len_mask;
10799 : : }
10800 : :
10801 : : /* Scale profiling counters by estimation for LOOP which is vectorized
10802 : : by factor VF.
10803 : : If FLAT is true, the loop we started with had unrealistically flat
10804 : : profile. */
10805 : :
10806 : : static void
10807 : 60623 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
10808 : : {
10809 : : /* For flat profiles do not scale down proportionally by VF and only
10810 : : cap by known iteration count bounds. */
10811 : 60623 : if (flat)
10812 : : {
10813 : 33873 : if (dump_file && (dump_flags & TDF_DETAILS))
10814 : 5053 : fprintf (dump_file,
10815 : : "Vectorized loop profile seems flat; not scaling iteration "
10816 : : "count down by the vectorization factor %i\n", vf);
10817 : 33873 : scale_loop_profile (loop, profile_probability::always (),
10818 : : get_likely_max_loop_iterations_int (loop));
10819 : 33873 : return;
10820 : : }
10821 : : /* Loop body executes VF fewer times and exit increases VF times. */
10822 : 26750 : profile_count entry_count = loop_preheader_edge (loop)->count ();
10823 : :
10824 : : /* If we have unreliable loop profile avoid dropping entry
10825 : : count below header count. This can happen since loops
10826 : : has unrealistically low trip counts. */
10827 : 26750 : while (vf > 1
10828 : 28015 : && loop->header->count > entry_count
10829 : 56901 : && loop->header->count < entry_count * vf)
10830 : : {
10831 : 2136 : if (dump_file && (dump_flags & TDF_DETAILS))
10832 : 149 : fprintf (dump_file,
10833 : : "Vectorization factor %i seems too large for profile "
10834 : : "prevoiusly believed to be consistent; reducing.\n", vf);
10835 : 2136 : vf /= 2;
10836 : : }
10837 : :
10838 : 26750 : if (entry_count.nonzero_p ())
10839 : 26750 : set_edge_probability_and_rescale_others
10840 : 26750 : (exit_e,
10841 : 26750 : entry_count.probability_in (loop->header->count / vf));
10842 : : /* Avoid producing very large exit probability when we do not have
10843 : : sensible profile. */
10844 : 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
10845 : 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10846 : 26750 : loop->latch->count = single_pred_edge (loop->latch)->count ();
10847 : :
10848 : 26750 : scale_loop_profile (loop, profile_probability::always () / vf,
10849 : : get_likely_max_loop_iterations_int (loop));
10850 : : }
10851 : :
10852 : : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10853 : : original loop that has now been vectorized.
10854 : :
10855 : : The inits of the data_references need to be advanced with the number of
10856 : : iterations of the main loop. This has been computed in vect_do_peeling and
10857 : : is stored in parameter ADVANCE.
10858 : :
10859 : : Since the loop_vec_info of this EPILOGUE was constructed for the original
10860 : : loop, its stmt_vec_infos all point to the original statements. These need
10861 : : to be updated to point to their corresponding copies.
10862 : :
10863 : : The data_reference's connections also need to be updated. Their
10864 : : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10865 : : stmt_vec_infos, their statements need to point to their corresponding
10866 : : copy. */
10867 : :
10868 : : static void
10869 : 6855 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10870 : : {
10871 : 6855 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10872 : 6855 : hash_map<tree,tree> mapping;
10873 : 6855 : gimple *orig_stmt, *new_stmt;
10874 : 6855 : gimple_stmt_iterator epilogue_gsi;
10875 : 6855 : gphi_iterator epilogue_phi_gsi;
10876 : 6855 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10877 : 6855 : basic_block *epilogue_bbs = get_loop_body (epilogue);
10878 : 6855 : unsigned i;
10879 : :
10880 : 6855 : free (LOOP_VINFO_BBS (epilogue_vinfo));
10881 : 6855 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10882 : 6855 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
10883 : :
10884 : : /* The EPILOGUE loop is a copy of the original loop so they share the same
10885 : : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10886 : : point to the copied statements. */
10887 : 20565 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10888 : : {
10889 : 13710 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10890 : 35334 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10891 : : {
10892 : 21624 : new_stmt = epilogue_phi_gsi.phi ();
10893 : :
10894 : 21624 : gcc_assert (gimple_uid (new_stmt) > 0);
10895 : 21624 : stmt_vinfo
10896 : 21624 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10897 : :
10898 : 21624 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10899 : : }
10900 : :
10901 : 27420 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10902 : 136284 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10903 : : {
10904 : 122574 : new_stmt = gsi_stmt (epilogue_gsi);
10905 : 122574 : if (is_gimple_debug (new_stmt))
10906 : 21873 : continue;
10907 : :
10908 : 100701 : gcc_assert (gimple_uid (new_stmt) > 0);
10909 : 100701 : stmt_vinfo
10910 : 100701 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10911 : :
10912 : 100701 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10913 : :
10914 : 100701 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10915 : 100701 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10916 : : {
10917 : 1856 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10918 : : /* Set BB such that the assert in
10919 : : 'get_initial_defs_for_reduction' is able to determine that
10920 : : the BB of the related stmt is inside this loop. */
10921 : 1856 : gimple_set_bb (stmt,
10922 : : gimple_bb (new_stmt));
10923 : 1856 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10924 : 1856 : gcc_assert (related_vinfo == NULL
10925 : : || related_vinfo == stmt_vinfo);
10926 : : }
10927 : : }
10928 : : }
10929 : :
10930 : 6855 : struct data_reference *dr;
10931 : 6855 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10932 : 29143 : FOR_EACH_VEC_ELT (datarefs, i, dr)
10933 : : {
10934 : 22288 : orig_stmt = DR_STMT (dr);
10935 : 22288 : gcc_assert (gimple_uid (orig_stmt) > 0);
10936 : 22288 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10937 : 22288 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10938 : : }
10939 : :
10940 : : /* Advance data_reference's with the number of iterations of the previous
10941 : : loop and its prologue. */
10942 : 6855 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10943 : :
10944 : : /* Remember the advancement made. */
10945 : 6855 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
10946 : 6855 : }
10947 : :
10948 : : /* When vectorizing early break statements instructions that happen before
10949 : : the early break in the current BB need to be moved to after the early
10950 : : break. This function deals with that and assumes that any validity
10951 : : checks have already been performed.
10952 : :
10953 : : While moving the instructions if it encounters a VUSE or VDEF it then
10954 : : corrects the VUSES as it moves the statements along. GDEST is the location
10955 : : in which to insert the new statements. */
10956 : :
static void
move_early_exit_stmts (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("move_early_exit_stmts");

  /* Nothing to do when no stores need to cross the early break.  */
  if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
    return;

  /* Move all stmts that need moving.  */
  basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
  gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);

  /* Track the last virtual use seen while walking the stores so the
     loads and exit PHIs below can be rewired to the new reaching
     virtual definition.  */
  tree last_seen_vuse = NULL_TREE;
  for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
    {
      /* We have to update crossed degenerate virtual PHIs.  Simply
	 elide them.  */
      if (gphi *vphi = dyn_cast <gphi *> (stmt))
	{
	  tree vdef = gimple_phi_result (vphi);
	  tree vuse = gimple_phi_arg_def (vphi, 0);
	  imm_use_iterator iter;
	  use_operand_p use_p;
	  gimple *use_stmt;
	  /* Redirect every use of the PHI result to its single argument
	     before removing the PHI itself.  */
	  FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
		SET_USE (use_p, vuse);
	    }
	  auto gsi = gsi_for_stmt (stmt);
	  remove_phi_node (&gsi, true);
	  last_seen_vuse = vuse;
	  continue;
	}

      /* Check to see if statement is still required for vect or has been
	 elided.  */
      auto stmt_info = loop_vinfo->lookup_stmt (stmt);
      if (!stmt_info)
	continue;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);

      gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
      gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
      last_seen_vuse = gimple_vuse (stmt);
    }

  /* Update all the stmts with their new reaching VUSES.  */
  for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "updating vuse to %T for load %G",
			 last_seen_vuse, p);
      gimple_set_vuse (p, last_seen_vuse);
      update_stmt (p);
    }

  /* And update the LC PHIs on exits.  */
  for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
      if (gphi *phi = get_virtual_phi (e->dest))
	SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
}
11023 : :
11024 : : /* Function vect_transform_loop.
11025 : :
11026 : : The analysis phase has determined that the loop is vectorizable.
11027 : : Vectorize the loop - created vectorized stmts to replace the scalar
11028 : : stmts in the loop, and update the loop exit condition.
11029 : : Returns scalar epilogue loop if any. */
11030 : :
11031 : : class loop *
11032 : 60623 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11033 : : {
11034 : 60623 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11035 : 60623 : class loop *epilogue = NULL;
11036 : 60623 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11037 : 60623 : int nbbs = loop->num_nodes;
11038 : 60623 : int i;
11039 : 60623 : tree niters_vector = NULL_TREE;
11040 : 60623 : tree step_vector = NULL_TREE;
11041 : 60623 : tree niters_vector_mult_vf = NULL_TREE;
11042 : 60623 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11043 : 60623 : unsigned int lowest_vf = constant_lower_bound (vf);
11044 : 60623 : gimple *stmt;
11045 : 60623 : bool check_profitability = false;
11046 : 60623 : unsigned int th;
11047 : 60623 : bool flat = maybe_flat_loop_profile (loop);
11048 : :
11049 : 60623 : DUMP_VECT_SCOPE ("vec_transform_loop");
11050 : :
11051 : 60623 : if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11052 : 53768 : loop_vinfo->shared->check_datarefs ();
11053 : :
11054 : : /* Use the more conservative vectorization threshold. If the number
11055 : : of iterations is constant assume the cost check has been performed
11056 : : by our caller. If the threshold makes all loops profitable that
11057 : : run at least the (estimated) vectorization factor number of times
11058 : : checking is pointless, too. */
11059 : 60623 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11060 : 60623 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11061 : : {
11062 : 18215 : if (dump_enabled_p ())
11063 : 172 : dump_printf_loc (MSG_NOTE, vect_location,
11064 : : "Profitability threshold is %d loop iterations.\n",
11065 : : th);
11066 : : check_profitability = true;
11067 : : }
11068 : :
11069 : : /* Make sure there exists a single-predecessor exit bb. Do this before
11070 : : versioning. */
11071 : 60623 : edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11072 : 60623 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11073 : : {
11074 : 18681 : split_loop_exit_edge (e, true);
11075 : 18681 : if (dump_enabled_p ())
11076 : 2221 : dump_printf (MSG_NOTE, "split exit edge\n");
11077 : : }
11078 : :
11079 : : /* Version the loop first, if required, so the profitability check
11080 : : comes first. */
11081 : :
11082 : 60623 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11083 : : {
11084 : 3673 : class loop *sloop
11085 : 3673 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11086 : 3673 : sloop->force_vectorize = false;
11087 : 3673 : check_profitability = false;
11088 : : }
11089 : :
11090 : : /* Make sure there exists a single-predecessor exit bb also on the
11091 : : scalar loop copy. Do this after versioning but before peeling
11092 : : so CFG structure is fine for both scalar and if-converted loop
11093 : : to make slpeel_duplicate_current_defs_from_edges face matched
11094 : : loop closed PHI nodes on the exit. */
11095 : 60623 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11096 : : {
11097 : 7989 : e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11098 : 7989 : if (! single_pred_p (e->dest))
11099 : : {
11100 : 7726 : split_loop_exit_edge (e, true);
11101 : 7726 : if (dump_enabled_p ())
11102 : 1124 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11103 : : }
11104 : : }
11105 : :
11106 : 60623 : tree niters = vect_build_loop_niters (loop_vinfo);
11107 : 60623 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11108 : 60623 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11109 : 60623 : bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11110 : 60623 : tree advance;
11111 : 60623 : drs_init_vec orig_drs_init;
11112 : :
11113 : 60623 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11114 : : &step_vector, &niters_vector_mult_vf, th,
11115 : : check_profitability, niters_no_overflow,
11116 : : &advance);
11117 : 60623 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11118 : 60623 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11119 : : {
11120 : : /* Ifcvt duplicates loop preheader, loop body and produces an basic
11121 : : block after loop exit. We need to scale all that. */
11122 : 89 : basic_block preheader
11123 : 89 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11124 : 89 : preheader->count
11125 : : = preheader->count.apply_probability
11126 : 89 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11127 : 89 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11128 : : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11129 : 89 : LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11130 : : }
11131 : :
11132 : 60623 : if (niters_vector == NULL_TREE)
11133 : : {
11134 : 26976 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11135 : 26976 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11136 : 54701 : && known_eq (lowest_vf, vf))
11137 : : {
11138 : 26973 : niters_vector
11139 : 26973 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11140 : 26973 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11141 : 26973 : step_vector = build_one_cst (TREE_TYPE (niters));
11142 : : }
11143 : 755 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11144 : 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11145 : : &step_vector, niters_no_overflow);
11146 : : else
11147 : : /* vect_do_peeling subtracted the number of peeled prologue
11148 : : iterations from LOOP_VINFO_NITERS. */
11149 : 754 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11150 : : &niters_vector, &step_vector,
11151 : : niters_no_overflow);
11152 : : }
11153 : :
11154 : : /* 1) Make sure the loop header has exactly two entries
11155 : : 2) Make sure we have a preheader basic block. */
11156 : :
11157 : 60623 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11158 : :
11159 : 60623 : split_edge (loop_preheader_edge (loop));
11160 : :
11161 : 60623 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11162 : : /* This will deal with any possible peeling. */
11163 : 1 : vect_prepare_for_masked_peels (loop_vinfo);
11164 : :
11165 : : /* Handle any code motion that we need to for early-break vectorization after
11166 : : we've done peeling but just before we start vectorizing. */
11167 : 60623 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11168 : 1436 : move_early_exit_stmts (loop_vinfo);
11169 : :
11170 : : /* Remove existing clobber stmts and prefetches. */
11171 : 185056 : for (i = 0; i < nbbs; i++)
11172 : : {
11173 : 124433 : basic_block bb = bbs[i];
11174 : 1069408 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11175 : : {
11176 : 820542 : stmt = gsi_stmt (si);
11177 : 820542 : if (gimple_clobber_p (stmt)
11178 : 820542 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11179 : : {
11180 : 88 : unlink_stmt_vdef (stmt);
11181 : 88 : gsi_remove (&si, true);
11182 : 88 : release_defs (stmt);
11183 : : }
11184 : : else
11185 : 820454 : gsi_next (&si);
11186 : : }
11187 : : }
11188 : :
11189 : : /* Schedule the SLP instances. */
11190 : 60623 : if (!loop_vinfo->slp_instances.is_empty ())
11191 : : {
11192 : 60623 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11193 : 60623 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11194 : : }
11195 : :
11196 : : /* Generate the loop invariant statements. */
11197 : 60623 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11198 : : {
11199 : 73 : if (dump_enabled_p ())
11200 : 30 : dump_printf_loc (MSG_NOTE, vect_location,
11201 : : "------>generating loop invariant statements\n");
11202 : 73 : gimple_stmt_iterator gsi;
11203 : 73 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11204 : 73 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11205 : : GSI_CONTINUE_LINKING);
11206 : : }
11207 : :
11208 : : /* Stub out scalar statements that must not survive vectorization and
11209 : : were not picked as relevant in any SLP instance.
11210 : : Doing this here helps with grouped statements, or statements that
11211 : : are involved in patterns. */
11212 : 185056 : for (i = 0; i < nbbs; i++)
11213 : : {
11214 : 124433 : basic_block bb = bbs[i];
11215 : 124433 : stmt_vec_info stmt_info;
11216 : 248866 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11217 : 1637730 : !gsi_end_p (gsi); gsi_next (&gsi))
11218 : : {
11219 : 1513297 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11220 : 6240 : if (!call || !gimple_call_internal_p (call))
11221 : 1508211 : continue;
11222 : 5086 : internal_fn ifn = gimple_call_internal_fn (call);
11223 : 5086 : if (ifn == IFN_MASK_LOAD)
11224 : : {
11225 : 657 : tree lhs = gimple_get_lhs (call);
11226 : 657 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11227 : : {
11228 : 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11229 : 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11230 : 0 : gsi_replace (&gsi, new_stmt, true);
11231 : : }
11232 : : }
11233 : 4429 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11234 : : {
11235 : 2295 : tree lhs = gimple_get_lhs (call);
11236 : 2295 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11237 : : {
11238 : 0 : tree else_arg
11239 : 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11240 : 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11241 : 0 : gsi_replace (&gsi, new_stmt, true);
11242 : : }
11243 : : }
11244 : 2134 : else if (ifn == IFN_MASK_CALL
11245 : 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11246 : 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11247 : 2138 : && !STMT_VINFO_LIVE_P (stmt_info))
11248 : : {
11249 : 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11250 : 4 : loop_vinfo->remove_stmt (stmt_info);
11251 : : }
11252 : : }
11253 : : }
11254 : :
11255 : : /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11256 : : a zero NITERS becomes a nonzero NITERS_VECTOR. */
11257 : 60623 : if (integer_onep (step_vector))
11258 : 60606 : niters_no_overflow = true;
11259 : 60623 : vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11260 : : niters_vector, step_vector, niters_vector_mult_vf,
11261 : 60623 : !niters_no_overflow);
11262 : :
11263 : 60623 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11264 : :
11265 : : /* True if the final iteration might not handle a full vector's
11266 : : worth of scalar iterations. */
11267 : 121246 : bool final_iter_may_be_partial
11268 : 60623 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11269 : 60623 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
11270 : :
11271 : : /* +1 to convert latch counts to loop iteration counts. */
11272 : 60623 : int bias_for_lowest = 1;
11273 : :
11274 : : /* When we are peeling for gaps then we take away one scalar iteration
11275 : : from the vector loop. Thus we can adjust the upper bound by one
11276 : : scalar iteration. But only when we know the bound applies to the
11277 : : IV exit test which might not be true when we have multiple exits. */
11278 : 60623 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11279 : 118021 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11280 : :
11281 : 60623 : int bias_for_assumed = bias_for_lowest;
11282 : 60623 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11283 : 60623 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11284 : : {
11285 : : /* When the amount of peeling is known at compile time, the first
11286 : : iteration will have exactly alignment_npeels active elements.
11287 : : In the worst case it will have at least one. */
11288 : 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11289 : 1 : bias_for_lowest += lowest_vf - min_first_active;
11290 : 1 : bias_for_assumed += assumed_vf - min_first_active;
11291 : : }
11292 : : /* In these calculations the "- 1" converts loop iteration counts
11293 : : back to latch counts. */
11294 : 60623 : if (loop->any_upper_bound)
11295 : : {
11296 : 60623 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11297 : 60623 : loop->nb_iterations_upper_bound
11298 : 60623 : = (final_iter_may_be_partial
11299 : 62076 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11300 : 2906 : lowest_vf) - 1
11301 : 59170 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11302 : 118340 : lowest_vf) - 1);
11303 : 60623 : if (main_vinfo
11304 : : /* Both peeling for alignment and peeling for gaps can end up
11305 : : with the scalar epilogue running for more than VF-1 iterations. */
11306 : 6855 : && !main_vinfo->peeling_for_alignment
11307 : 6807 : && !main_vinfo->peeling_for_gaps)
11308 : : {
11309 : 6644 : unsigned int bound;
11310 : 6644 : poly_uint64 main_iters
11311 : 6644 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11312 : : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11313 : 6644 : main_iters
11314 : 6644 : = upper_bound (main_iters,
11315 : 6644 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11316 : 13288 : if (can_div_away_from_zero_p (main_iters,
11317 : 6644 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11318 : : &bound))
11319 : 6644 : loop->nb_iterations_upper_bound
11320 : 6644 : = wi::umin ((bound_wide_int) (bound - 1),
11321 : 6644 : loop->nb_iterations_upper_bound);
11322 : : }
11323 : : }
11324 : 60623 : if (loop->any_likely_upper_bound)
11325 : 60623 : loop->nb_iterations_likely_upper_bound
11326 : 60623 : = (final_iter_may_be_partial
11327 : 62076 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11328 : 1453 : + bias_for_lowest, lowest_vf) - 1
11329 : 59170 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11330 : 60623 : + bias_for_lowest, lowest_vf) - 1);
11331 : 60623 : if (loop->any_estimate)
11332 : 35121 : loop->nb_iterations_estimate
11333 : 35121 : = (final_iter_may_be_partial
11334 : 35908 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11335 : 1574 : assumed_vf) - 1
11336 : 34334 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11337 : 69455 : assumed_vf) - 1);
11338 : 60623 : scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11339 : : assumed_vf, flat);
11340 : :
11341 : 60623 : if (dump_enabled_p ())
11342 : : {
11343 : 10451 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11344 : : {
11345 : 9078 : dump_printf_loc (MSG_NOTE, vect_location,
11346 : : "LOOP VECTORIZED\n");
11347 : 9078 : if (loop->inner)
11348 : 286 : dump_printf_loc (MSG_NOTE, vect_location,
11349 : : "OUTER LOOP VECTORIZED\n");
11350 : 9078 : dump_printf (MSG_NOTE, "\n");
11351 : : }
11352 : : else
11353 : 1373 : dump_printf_loc (MSG_NOTE, vect_location,
11354 : : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11355 : 1373 : GET_MODE_NAME (loop_vinfo->vector_mode));
11356 : : }
11357 : :
11358 : : /* Loops vectorized with a variable factor won't benefit from
11359 : : unrolling/peeling. */
11360 : 60623 : if (!vf.is_constant ())
11361 : : {
11362 : : loop->unroll = 1;
11363 : : if (dump_enabled_p ())
11364 : : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11365 : : " variable-length vectorization factor\n");
11366 : : }
11367 : :
11368 : : /* When we have unrolled the loop due to a user requested value we should
11369 : : leave it up to the RTL unroll heuristics to determine if it's still worth
11370 : : while to unroll more. */
11371 : 60623 : if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
11372 : 44 : loop->unroll = 0;
11373 : :
11374 : : /* Free SLP instances here because otherwise stmt reference counting
11375 : : won't work. */
11376 : : slp_instance instance;
11377 : 151793 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11378 : 91170 : vect_free_slp_instance (instance);
11379 : 60623 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11380 : : /* Clear-up safelen field since its value is invalid after vectorization
11381 : : since vectorized loop can have loop-carried dependencies. */
11382 : 60623 : loop->safelen = 0;
11383 : :
11384 : 60623 : if (epilogue)
11385 : : {
11386 : : /* Accumulate past advancements made. */
11387 : 6855 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
11388 : 89 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
11389 : : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
11390 : : advance);
11391 : 6855 : update_epilogue_loop_vinfo (epilogue, advance);
11392 : :
11393 : 6855 : epilogue->simduid = loop->simduid;
11394 : 6855 : epilogue->force_vectorize = loop->force_vectorize;
11395 : 6855 : epilogue->dont_vectorize = false;
11396 : : }
11397 : :
11398 : 60623 : return epilogue;
11399 : 60623 : }
11400 : :
11401 : : /* The code below is trying to perform simple optimization - revert
11402 : : if-conversion for masked stores, i.e. if the mask of a store is zero
11403 : : do not perform it and all stored value producers also if possible.
11404 : : For example,
11405 : : for (i=0; i<n; i++)
11406 : : if (c[i])
11407 : : {
11408 : : p1[i] += 1;
11409 : : p2[i] = p3[i] +2;
11410 : : }
11411 : : this transformation will produce the following semi-hammock:
11412 : :
11413 : : if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11414 : : {
11415 : : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11416 : : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11417 : : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11418 : : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11419 : : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11420 : : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11421 : : }
11422 : : */
11423 : :
11424 : : void
11425 : 495 : optimize_mask_stores (class loop *loop)
11426 : : {
11427 : 495 : basic_block *bbs = get_loop_body (loop);
11428 : 495 : unsigned nbbs = loop->num_nodes;
11429 : 495 : unsigned i;
11430 : 495 : basic_block bb;
11431 : 495 : class loop *bb_loop;
11432 : 495 : gimple_stmt_iterator gsi;
11433 : 495 : gimple *stmt;
11434 : 495 : auto_vec<gimple *> worklist;
11435 : 495 : auto_purge_vect_location sentinel;
11436 : :
11437 : 495 : vect_location = find_loop_location (loop);
11438 : : /* Pick up all masked stores in loop if any. */
11439 : 1980 : for (i = 0; i < nbbs; i++)
11440 : : {
11441 : 990 : bb = bbs[i];
11442 : 16309 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11443 : 14329 : gsi_next (&gsi))
11444 : : {
11445 : 14329 : stmt = gsi_stmt (gsi);
11446 : 14329 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11447 : 649 : worklist.safe_push (stmt);
11448 : : }
11449 : : }
11450 : :
11451 : 495 : free (bbs);
11452 : 495 : if (worklist.is_empty ())
11453 : 68 : return;
11454 : :
11455 : : /* Loop has masked stores. */
11456 : 1059 : while (!worklist.is_empty ())
11457 : : {
11458 : 632 : gimple *last, *last_store;
11459 : 632 : edge e, efalse;
11460 : 632 : tree mask;
11461 : 632 : basic_block store_bb, join_bb;
11462 : 632 : gimple_stmt_iterator gsi_to;
11463 : 632 : tree vdef, new_vdef;
11464 : 632 : gphi *phi;
11465 : 632 : tree vectype;
11466 : 632 : tree zero;
11467 : :
11468 : 632 : last = worklist.pop ();
11469 : 632 : mask = gimple_call_arg (last, 2);
11470 : 632 : bb = gimple_bb (last);
11471 : : /* Create then_bb and if-then structure in CFG, then_bb belongs to
11472 : : the same loop as if_bb. It could be different to LOOP when two
11473 : : level loop-nest is vectorized and mask_store belongs to the inner
11474 : : one. */
11475 : 632 : e = split_block (bb, last);
11476 : 632 : bb_loop = bb->loop_father;
11477 : 632 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11478 : 632 : join_bb = e->dest;
11479 : 632 : store_bb = create_empty_bb (bb);
11480 : 632 : add_bb_to_loop (store_bb, bb_loop);
11481 : 632 : e->flags = EDGE_TRUE_VALUE;
11482 : 632 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11483 : : /* Put STORE_BB to likely part. */
11484 : 632 : efalse->probability = profile_probability::likely ();
11485 : 632 : e->probability = efalse->probability.invert ();
11486 : 632 : store_bb->count = efalse->count ();
11487 : 632 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11488 : 632 : if (dom_info_available_p (CDI_DOMINATORS))
11489 : 632 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11490 : 632 : if (dump_enabled_p ())
11491 : 299 : dump_printf_loc (MSG_NOTE, vect_location,
11492 : : "Create new block %d to sink mask stores.",
11493 : : store_bb->index);
11494 : : /* Create vector comparison with boolean result. */
11495 : 632 : vectype = TREE_TYPE (mask);
11496 : 632 : zero = build_zero_cst (vectype);
11497 : 632 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11498 : 632 : gsi = gsi_last_bb (bb);
11499 : 632 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11500 : : /* Create new PHI node for vdef of the last masked store:
11501 : : .MEM_2 = VDEF <.MEM_1>
11502 : : will be converted to
11503 : : .MEM.3 = VDEF <.MEM_1>
11504 : : and new PHI node will be created in join bb
11505 : : .MEM_2 = PHI <.MEM_1, .MEM_3>
11506 : : */
11507 : 632 : vdef = gimple_vdef (last);
11508 : 632 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
11509 : 632 : gimple_set_vdef (last, new_vdef);
11510 : 632 : phi = create_phi_node (vdef, join_bb);
11511 : 632 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11512 : :
11513 : : /* Put all masked stores with the same mask to STORE_BB if possible. */
11514 : 666 : while (true)
11515 : : {
11516 : 649 : gimple_stmt_iterator gsi_from;
11517 : 649 : gimple *stmt1 = NULL;
11518 : :
11519 : : /* Move masked store to STORE_BB. */
11520 : 649 : last_store = last;
11521 : 649 : gsi = gsi_for_stmt (last);
11522 : 649 : gsi_from = gsi;
11523 : : /* Shift GSI to the previous stmt for further traversal. */
11524 : 649 : gsi_prev (&gsi);
11525 : 649 : gsi_to = gsi_start_bb (store_bb);
11526 : 649 : gsi_move_before (&gsi_from, &gsi_to);
11527 : : /* Setup GSI_TO to the non-empty block start. */
11528 : 649 : gsi_to = gsi_start_bb (store_bb);
11529 : 649 : if (dump_enabled_p ())
11530 : 315 : dump_printf_loc (MSG_NOTE, vect_location,
11531 : : "Move stmt to created bb\n%G", last);
11532 : : /* Move all stored value producers if possible. */
11533 : 4439 : while (!gsi_end_p (gsi))
11534 : : {
11535 : 4438 : tree lhs;
11536 : 4438 : imm_use_iterator imm_iter;
11537 : 4438 : use_operand_p use_p;
11538 : 4438 : bool res;
11539 : :
11540 : : /* Skip debug statements. */
11541 : 4438 : if (is_gimple_debug (gsi_stmt (gsi)))
11542 : : {
11543 : 3 : gsi_prev (&gsi);
11544 : 2777 : continue;
11545 : : }
11546 : 4435 : stmt1 = gsi_stmt (gsi);
11547 : : /* Do not consider statements writing to memory or having
11548 : : volatile operand. */
11549 : 8750 : if (gimple_vdef (stmt1)
11550 : 8750 : || gimple_has_volatile_ops (stmt1))
11551 : : break;
11552 : 4315 : gsi_from = gsi;
11553 : 4315 : gsi_prev (&gsi);
11554 : 4315 : lhs = gimple_get_lhs (stmt1);
11555 : 4315 : if (!lhs)
11556 : : break;
11557 : :
11558 : : /* LHS of vectorized stmt must be SSA_NAME. */
11559 : 4315 : if (TREE_CODE (lhs) != SSA_NAME)
11560 : : break;
11561 : :
11562 : 4315 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11563 : : {
11564 : : /* Remove dead scalar statement. */
11565 : 3067 : if (has_zero_uses (lhs))
11566 : : {
11567 : 2774 : gsi_remove (&gsi_from, true);
11568 : 2774 : release_defs (stmt1);
11569 : 2774 : continue;
11570 : : }
11571 : : }
11572 : :
11573 : : /* Check that LHS does not have uses outside of STORE_BB. */
11574 : 1541 : res = true;
11575 : 4186 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11576 : : {
11577 : 1632 : gimple *use_stmt;
11578 : 1632 : use_stmt = USE_STMT (use_p);
11579 : 1632 : if (is_gimple_debug (use_stmt))
11580 : 0 : continue;
11581 : 1632 : if (gimple_bb (use_stmt) != store_bb)
11582 : : {
11583 : : res = false;
11584 : : break;
11585 : : }
11586 : 1541 : }
11587 : 1541 : if (!res)
11588 : : break;
11589 : :
11590 : 1013 : if (gimple_vuse (stmt1)
11591 : 1448 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
11592 : : break;
11593 : :
11594 : : /* Can move STMT1 to STORE_BB. */
11595 : 1013 : if (dump_enabled_p ())
11596 : 529 : dump_printf_loc (MSG_NOTE, vect_location,
11597 : : "Move stmt to created bb\n%G", stmt1);
11598 : 1013 : gsi_move_before (&gsi_from, &gsi_to);
11599 : : /* Shift GSI_TO for further insertion. */
11600 : 2026 : gsi_prev (&gsi_to);
11601 : : }
11602 : : /* Put other masked stores with the same mask to STORE_BB. */
11603 : 649 : if (worklist.is_empty ()
11604 : 222 : || gimple_call_arg (worklist.last (), 2) != mask
11605 : 17 : || worklist.last () != stmt1)
11606 : : break;
11607 : 17 : last = worklist.pop ();
11608 : 17 : }
11609 : 1264 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11610 : : }
11611 : 495 : }
11612 : :
11613 : : /* Decide whether it is possible to use a zero-based induction variable
11614 : : when vectorizing LOOP_VINFO with partial vectors. If it is, return
11615 : : the value that the induction variable must be able to hold in order
11616 : : to ensure that the rgroups eventually have no active vector elements.
11617 : : Return -1 otherwise. */
11618 : :
11619 : : widest_int
11620 : 31308 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11621 : : {
11622 : 31308 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11623 : 31308 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11624 : 31308 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11625 : :
11626 : : /* Calculate the value that the induction variable must be able
11627 : : to hit in order to ensure that we end the loop with an all-false mask.
11628 : : This involves adding the maximum number of inactive trailing scalar
11629 : : iterations. */
11630 : 31308 : widest_int iv_limit = -1;
11631 : 31308 : if (max_loop_iterations (loop, &iv_limit))
11632 : : {
11633 : 31308 : if (niters_skip)
11634 : : {
11635 : : /* Add the maximum number of skipped iterations to the
11636 : : maximum iteration count. */
11637 : 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
11638 : 0 : iv_limit += wi::to_widest (niters_skip);
11639 : : else
11640 : 0 : iv_limit += max_vf - 1;
11641 : : }
11642 : 31308 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11643 : : /* Make a conservatively-correct assumption. */
11644 : 8 : iv_limit += max_vf - 1;
11645 : :
11646 : : /* IV_LIMIT is the maximum number of latch iterations, which is also
11647 : : the maximum in-range IV value. Round this value down to the previous
11648 : : vector alignment boundary and then add an extra full iteration. */
11649 : 31308 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11650 : 31308 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11651 : : }
11652 : 31308 : return iv_limit;
11653 : : }
11654 : :
11655 : : /* For the given rgroup_controls RGC, check whether an induction variable
11656 : : would ever hit a value that produces a set of all-false masks or zero
11657 : : lengths before wrapping around. Return true if it's possible to wrap
11658 : : around before hitting the desirable value, otherwise return false. */
11659 : :
11660 : : bool
11661 : 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11662 : : {
11663 : 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11664 : :
11665 : 0 : if (iv_limit == -1)
11666 : : return true;
11667 : :
11668 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11669 : 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
11670 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11671 : :
11672 : 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11673 : : return true;
11674 : :
11675 : : return false;
11676 : 0 : }
|